aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2021-06-09 15:12:15 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2021-06-09 15:12:15 +1000
commitcca935aa8c5be48bf44b6a05edfe9adcb556a34e (patch)
treee44edcdf130d146942585d137c7caa0393fc8f3f
parent3916143235117650889ef6d8e470eb6d9ae11d49 (diff)
parent5587a2080684547b08b76e7817d192177b4f49f6 (diff)
downloadlinux-next-cca935aa8c5be48bf44b6a05edfe9adcb556a34e.tar.gz
Merge remote-tracking branch 'rcu/rcu/next'
Notice: this object is not reachable from any branch.
Notice: this object is not reachable from any branch.
-rw-r--r--Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst6
-rw-r--r--Documentation/RCU/Design/Requirements/Requirements.rst8
-rw-r--r--Documentation/RCU/checklist.rst24
-rw-r--r--Documentation/RCU/rcu_dereference.rst6
-rw-r--r--Documentation/RCU/stallwarn.rst10
-rw-r--r--Documentation/admin-guide/kernel-parameters.rst5
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt27
-rw-r--r--Documentation/dev-tools/kcsan.rst93
-rw-r--r--Documentation/litmus-tests/locking/DCL-broken.litmus55
-rw-r--r--Documentation/litmus-tests/locking/DCL-fixed.litmus56
-rw-r--r--Documentation/litmus-tests/locking/RM-broken.litmus42
-rw-r--r--Documentation/litmus-tests/locking/RM-fixed.litmus42
-rw-r--r--arch/x86/kernel/tsc.c4
-rw-r--r--include/linux/clocksource.h8
-rw-r--r--include/linux/rculist.h35
-rw-r--r--include/linux/rcupdate.h86
-rw-r--r--include/linux/rcutiny.h4
-rw-r--r--include/linux/rcutree.h1
-rw-r--r--include/linux/srcu.h6
-rw-r--r--include/linux/srcutiny.h8
-rw-r--r--include/linux/srcutree.h2
-rw-r--r--include/linux/timer.h2
-rw-r--r--include/trace/events/rcu.h1
-rw-r--r--init/main.c2
-rw-r--r--kernel/kcsan/core.c53
-rw-r--r--kernel/kcsan/kcsan.h39
-rw-r--r--kernel/kcsan/report.c169
-rw-r--r--kernel/locking/lockdep.c6
-rw-r--r--kernel/locking/locktorture.c25
-rw-r--r--kernel/rcu/Kconfig.debug2
-rw-r--r--kernel/rcu/rcu.h14
-rw-r--r--kernel/rcu/rcutorture.c322
-rw-r--r--kernel/rcu/refscale.c144
-rw-r--r--kernel/rcu/srcutiny.c2
-rw-r--r--kernel/rcu/srcutree.c28
-rw-r--r--kernel/rcu/sync.c4
-rw-r--r--kernel/rcu/tasks.h94
-rw-r--r--kernel/rcu/tiny.c1
-rw-r--r--kernel/rcu/tree.c422
-rw-r--r--kernel/rcu/tree.h16
-rw-r--r--kernel/rcu/tree_nocb.h1496
-rw-r--r--kernel/rcu/tree_plugin.h1527
-rw-r--r--kernel/rcu/tree_stall.h165
-rw-r--r--kernel/rcu/update.c8
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/clocksource-wdtest.c202
-rw-r--r--kernel/time/clocksource.c226
-rw-r--r--kernel/time/jiffies.c15
-rw-r--r--kernel/time/timer.c14
-rw-r--r--lib/Kconfig.debug12
-rw-r--r--lib/bitmap.c9
-rw-r--r--lib/test_bitmap.c7
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/slab.h1
-rw-r--r--mm/slab_common.c12
-rw-r--r--mm/slub.c8
-rw-r--r--mm/util.c2
-rw-r--r--tools/memory-model/Documentation/access-marking.txt152
-rw-r--r--tools/memory-model/Documentation/explanation.txt2
-rw-r--r--tools/memory-model/Documentation/locking.txt320
-rw-r--r--tools/memory-model/litmus-tests/.gitignore2
-rw-r--r--tools/memory-model/scripts/README16
-rwxr-xr-xtools/memory-model/scripts/checkalllitmus.sh29
-rwxr-xr-xtools/memory-model/scripts/checkghlitmus.sh11
-rwxr-xr-xtools/memory-model/scripts/checklitmus.sh25
-rwxr-xr-xtools/memory-model/scripts/checklitmushist.sh2
-rwxr-xr-xtools/memory-model/scripts/checktheselitmus.sh43
-rwxr-xr-xtools/memory-model/scripts/cmplitmushist.sh49
-rwxr-xr-xtools/memory-model/scripts/hwfnseg.sh20
-rwxr-xr-xtools/memory-model/scripts/initlitmushist.sh2
-rwxr-xr-xtools/memory-model/scripts/judgelitmus.sh120
-rwxr-xr-xtools/memory-model/scripts/newlitmushist.sh4
-rwxr-xr-xtools/memory-model/scripts/parseargs.sh21
-rwxr-xr-xtools/memory-model/scripts/runlitmus.sh80
-rwxr-xr-xtools/memory-model/scripts/runlitmushist.sh29
-rwxr-xr-xtools/memory-model/scripts/simpletest.sh35
-rw-r--r--tools/rcu/rcu-cbs.py46
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kcsan-collapse.sh2
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-again.sh33
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-build.sh6
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh40
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-find-errors.sh2
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh2
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-remote.sh249
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm.sh61
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/torture.sh39
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST17
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot8
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuscale/TREE2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuscale/TREE542
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT2
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/PREEMPT2
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h2
93 files changed, 4598 insertions, 2460 deletions
diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
index a648b423ba0eb..11cdab037bff6 100644
--- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
+++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
@@ -21,7 +21,7 @@ Any code that happens after the end of a given RCU grace period is guaranteed
to see the effects of all accesses prior to the beginning of that grace
period that are within RCU read-side critical sections.
Similarly, any code that happens before the beginning of a given RCU grace
-period is guaranteed to see the effects of all accesses following the end
+period is guaranteed to not see the effects of all accesses following the end
of that grace period that are within RCU read-side critical sections.
Note well that RCU-sched read-side critical sections include any region
@@ -339,14 +339,14 @@ The diagram below shows the path of ordering if the leftmost
leftmost ``rcu_node`` structure offlines its last CPU and if the next
``rcu_node`` structure has no online CPUs).
-.. kernel-figure:: TreeRCU-gp-init-1.svg
+.. kernel-figure:: TreeRCU-gp-init-2.svg
The final ``rcu_gp_init()`` pass through the ``rcu_node`` tree traverses
breadth-first, setting each ``rcu_node`` structure's ``->gp_seq`` field
to the newly advanced value from the ``rcu_state`` structure, as shown
in the following diagram.
-.. kernel-figure:: TreeRCU-gp-init-1.svg
+.. kernel-figure:: TreeRCU-gp-init-3.svg
This change will also cause each CPU's next call to
``__note_gp_changes()`` to notice that a new grace period has started,
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index 38a39476fc248..45278e2974c04 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -362,9 +362,8 @@ do_something_gp() uses rcu_dereference() to fetch from ``gp``:
12 }
The rcu_dereference() uses volatile casts and (for DEC Alpha) memory
-barriers in the Linux kernel. Should a `high-quality implementation of
-C11 ``memory_order_consume``
-[PDF] <http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf>`__
+barriers in the Linux kernel. Should a |high-quality implementation of
+C11 memory_order_consume [PDF]|_
ever appear, then rcu_dereference() could be implemented as a
``memory_order_consume`` load. Regardless of the exact implementation, a
pointer fetched by rcu_dereference() may not be used outside of the
@@ -374,6 +373,9 @@ element has been passed from RCU to some other synchronization
mechanism, most commonly locking or `reference
counting <https://www.kernel.org/doc/Documentation/RCU/rcuref.txt>`__.
+.. |high-quality implementation of C11 memory_order_consume [PDF]| replace:: high-quality implementation of C11 ``memory_order_consume`` [PDF]
+.. _high-quality implementation of C11 memory_order_consume [PDF]: http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf
+
In short, updaters use rcu_assign_pointer() and readers use
rcu_dereference(), and these two RCU API elements work together to
ensure that readers have a consistent view of newly added data elements.
diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index 1030119294d08..a71d3f1343237 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -37,7 +37,7 @@ over a rather long period of time, but improvements are always welcome!
1. Does the update code have proper mutual exclusion?
- RCU does allow -readers- to run (almost) naked, but -writers- must
+ RCU does allow *readers* to run (almost) naked, but *writers* must
still use some sort of mutual exclusion, such as:
a. locking,
@@ -73,7 +73,7 @@ over a rather long period of time, but improvements are always welcome!
critical section is every bit as bad as letting them leak out
from under a lock. Unless, of course, you have arranged some
other means of protection, such as a lock or a reference count
- -before- letting them out of the RCU read-side critical section.
+ *before* letting them out of the RCU read-side critical section.
3. Does the update code tolerate concurrent accesses?
@@ -101,7 +101,7 @@ over a rather long period of time, but improvements are always welcome!
c. Make updates appear atomic to readers. For example,
pointer updates to properly aligned fields will
appear atomic, as will individual atomic primitives.
- Sequences of operations performed under a lock will -not-
+ Sequences of operations performed under a lock will *not*
appear to be atomic to RCU readers, nor will sequences
of multiple atomic primitives.
@@ -320,7 +320,7 @@ over a rather long period of time, but improvements are always welcome!
for example) may be omitted.
10. Conversely, if you are in an RCU read-side critical section,
- and you don't hold the appropriate update-side lock, you -must-
+ and you don't hold the appropriate update-side lock, you *must*
use the "_rcu()" variants of the list macros. Failing to do so
will break Alpha, cause aggressive compilers to generate bad code,
and confuse people trying to read your code.
@@ -346,12 +346,12 @@ over a rather long period of time, but improvements are always welcome!
callback pending, then that RCU callback will execute on some
surviving CPU. (If this was not the case, a self-spawning RCU
callback would prevent the victim CPU from ever going offline.)
- Furthermore, CPUs designated by rcu_nocbs= might well -always-
+ Furthermore, CPUs designated by rcu_nocbs= might well *always*
have their RCU callbacks executed on some other CPUs, in fact,
for some real-time workloads, this is the whole point of using
the rcu_nocbs= kernel boot parameter.
-13. Unlike other forms of RCU, it -is- permissible to block in an
+13. Unlike other forms of RCU, it *is* permissible to block in an
SRCU read-side critical section (demarked by srcu_read_lock()
and srcu_read_unlock()), hence the "SRCU": "sleepable RCU".
Please note that if you don't need to sleep in read-side critical
@@ -398,16 +398,16 @@ over a rather long period of time, but improvements are always welcome!
14. The whole point of call_rcu(), synchronize_rcu(), and friends
is to wait until all pre-existing readers have finished before
carrying out some otherwise-destructive operation. It is
- therefore critically important to -first- remove any path
+ therefore critically important to *first* remove any path
that readers can follow that could be affected by the
- destructive operation, and -only- -then- invoke call_rcu(),
+ destructive operation, and *only then* invoke call_rcu(),
synchronize_rcu(), or friends.
Because these primitives only wait for pre-existing readers, it
is the caller's responsibility to guarantee that any subsequent
readers will execute safely.
-15. The various RCU read-side primitives do -not- necessarily contain
+15. The various RCU read-side primitives do *not* necessarily contain
memory barriers. You should therefore plan for the CPU
and the compiler to freely reorder code into and out of RCU
read-side critical sections. It is the responsibility of the
@@ -446,8 +446,8 @@ over a rather long period of time, but improvements are always welcome!
pass in a function defined within a loadable module, then it in
necessary to wait for all pending callbacks to be invoked after
the last invocation and before unloading that module. Note that
- it is absolutely -not- sufficient to wait for a grace period!
- The current (say) synchronize_rcu() implementation is -not-
+ it is absolutely *not* sufficient to wait for a grace period!
+ The current (say) synchronize_rcu() implementation is *not*
guaranteed to wait for callbacks registered on other CPUs.
Or even on the current CPU if that CPU recently went offline
and came back online.
@@ -457,7 +457,7 @@ over a rather long period of time, but improvements are always welcome!
- call_rcu() -> rcu_barrier()
- call_srcu() -> srcu_barrier()
- However, these barrier functions are absolutely -not- guaranteed
+ However, these barrier functions are absolutely *not* guaranteed
to wait for a grace period. In fact, if there are no call_rcu()
callbacks waiting anywhere in the system, rcu_barrier() is within
its rights to return immediately.
diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst
index f3e587acb4deb..0b418a5b243c5 100644
--- a/Documentation/RCU/rcu_dereference.rst
+++ b/Documentation/RCU/rcu_dereference.rst
@@ -43,7 +43,7 @@ Follow these rules to keep your RCU code working properly:
- Set bits and clear bits down in the must-be-zero low-order
bits of that pointer. This clearly means that the pointer
must have alignment constraints, for example, this does
- -not- work in general for char* pointers.
+ *not* work in general for char* pointers.
- XOR bits to translate pointers, as is done in some
classic buddy-allocator algorithms.
@@ -174,7 +174,7 @@ Follow these rules to keep your RCU code working properly:
Please see the "CONTROL DEPENDENCIES" section of
Documentation/memory-barriers.txt for more details.
- - The pointers are not equal -and- the compiler does
+ - The pointers are not equal *and* the compiler does
not have enough information to deduce the value of the
pointer. Note that the volatile cast in rcu_dereference()
will normally prevent the compiler from knowing too much.
@@ -360,7 +360,7 @@ in turn destroying the ordering between this load and the loads of the
return values. This can result in "p->b" returning pre-initialization
garbage values.
-In short, rcu_dereference() is -not- optional when you are going to
+In short, rcu_dereference() is *not* optional when you are going to
dereference the resulting pointer.
diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
index 7148e9be08c34..f1c49c626e934 100644
--- a/Documentation/RCU/stallwarn.rst
+++ b/Documentation/RCU/stallwarn.rst
@@ -32,7 +32,7 @@ warnings:
- Booting Linux using a console connection that is too slow to
keep up with the boot-time console-message rate. For example,
- a 115Kbaud serial console can be -way- too slow to keep up
+ a 115Kbaud serial console can be *way* too slow to keep up
with boot-time message rates, and will frequently result in
RCU CPU stall warning messages. Especially if you have added
debug printk()s.
@@ -105,7 +105,7 @@ warnings:
leading the realization that the CPU had failed.
The RCU, RCU-sched, and RCU-tasks implementations have CPU stall warning.
-Note that SRCU does -not- have CPU stall warnings. Please note that
+Note that SRCU does *not* have CPU stall warnings. Please note that
RCU only detects CPU stalls when there is a grace period in progress.
No grace period, no CPU stall warnings.
@@ -145,7 +145,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT
this parameter is checked only at the beginning of a cycle.
So if you are 10 seconds into a 40-second stall, setting this
sysfs parameter to (say) five will shorten the timeout for the
- -next- stall, or the following warning for the current stall
+ *next* stall, or the following warning for the current stall
(assuming the stall lasts long enough). It will not affect the
timing of the next warning for the current stall.
@@ -202,7 +202,7 @@ causing stalls, and that the stall was affecting RCU-sched. This message
will normally be followed by stack dumps for each CPU. Please note that
PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that
the tasks will be indicated by PID, for example, "P3421". It is even
-possible for an rcu_state stall to be caused by both CPUs -and- tasks,
+possible for an rcu_state stall to be caused by both CPUs *and* tasks,
in which case the offending CPUs and tasks will all be called out in the list.
CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with
@@ -224,7 +224,7 @@ is the number that had executed since boot at the time that this CPU
last noted the beginning of a grace period, which might be the current
(stalled) grace period, or it might be some earlier grace period (for
example, if the CPU might have been in dyntick-idle mode for an extended
-time period. The number after the "/" is the number that have executed
+time period). The number after the "/" is the number that have executed
since boot until the current time. If this latter number stays constant
across repeated stall-warning messages, it is possible that RCU's softirq
handlers are no longer able to execute on this CPU. This can happen if
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index 3996b54158bf5..01ba293a2d70f 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -76,6 +76,11 @@ to change, such as less cores in the CPU list, then N and any ranges using N
will also change. Use the same on a small 4 core system, and "16-N" becomes
"16-3" and now the same boot input will be flagged as invalid (start > end).
+The special case-tolerant group name "all" has a meaning of selecting all CPUs,
+so that "nohz_full=all" is the equivalent of "nohz_full=0-N".
+
+The semantics of "N" and "all" is supported on a level of bitmaps and holds for
+all users of bitmap_parse().
This document may not be entirely up to date and comprehensive. The command
"modinfo -p ${modulename}" shows a current list of all parameters of a loadable
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b7c8e7424e52e..4b92bb850730d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -584,6 +584,28 @@
loops can be debugged more effectively on production
systems.
+ clocksource.max_cswd_read_retries= [KNL]
+ Number of clocksource_watchdog() retries due to
+ external delays before the clock will be marked
+ unstable. Defaults to three retries, that is,
+ four attempts to read the clock under test.
+
+ clocksource.verify_n_cpus= [KNL]
+ Limit the number of CPUs checked for clocksources
+ marked with CLOCK_SOURCE_VERIFY_PERCPU that
+ are marked unstable due to excessive skew.
+ A negative value says to check all CPUs, while
+ zero says not to check any. Values larger than
+ nr_cpu_ids are silently truncated to nr_cpu_ids.
+ The actual CPUs are chosen randomly, with
+ no replacement if the same CPU is chosen twice.
+
+ clocksource-wdtest.holdoff= [KNL]
+ Set the time in seconds that the clocksource
+ watchdog test waits before commencing its tests.
+ Defaults to zero when built as a module and to
+ 10 seconds when built into the kernel.
+
clearcpuid=BITNUM[,BITNUM...] [X86]
Disable CPUID feature X for the kernel. See
arch/x86/include/asm/cpufeatures.h for the valid bit
@@ -4296,6 +4318,11 @@
whole algorithm to behave better in low memory
condition.
+ rcutree.rcu_delay_page_cache_fill_msec= [KNL]
+ Set the page-cache refill delay (in milliseconds)
+ in response to low-memory conditions. The range
+ of permitted values is in the range 0:100000.
+
rcutree.jiffies_till_first_fqs= [KNL]
Set delay from grace-period initialization to
first attempt to force quiescent states.
diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst
index d85ce238ace78..6a600cf8430b1 100644
--- a/Documentation/dev-tools/kcsan.rst
+++ b/Documentation/dev-tools/kcsan.rst
@@ -27,75 +27,57 @@ Error reports
A typical data race report looks like this::
==================================================================
- BUG: KCSAN: data-race in generic_permission / kernfs_refresh_inode
-
- write to 0xffff8fee4c40700c of 4 bytes by task 175 on cpu 4:
- kernfs_refresh_inode+0x70/0x170
- kernfs_iop_permission+0x4f/0x90
- inode_permission+0x190/0x200
- link_path_walk.part.0+0x503/0x8e0
- path_lookupat.isra.0+0x69/0x4d0
- filename_lookup+0x136/0x280
- user_path_at_empty+0x47/0x60
- vfs_statx+0x9b/0x130
- __do_sys_newlstat+0x50/0xb0
- __x64_sys_newlstat+0x37/0x50
- do_syscall_64+0x85/0x260
- entry_SYSCALL_64_after_hwframe+0x44/0xa9
-
- read to 0xffff8fee4c40700c of 4 bytes by task 166 on cpu 6:
- generic_permission+0x5b/0x2a0
- kernfs_iop_permission+0x66/0x90
- inode_permission+0x190/0x200
- link_path_walk.part.0+0x503/0x8e0
- path_lookupat.isra.0+0x69/0x4d0
- filename_lookup+0x136/0x280
- user_path_at_empty+0x47/0x60
- do_faccessat+0x11a/0x390
- __x64_sys_access+0x3c/0x50
- do_syscall_64+0x85/0x260
- entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ BUG: KCSAN: data-race in test_kernel_read / test_kernel_write
+
+ write to 0xffffffffc009a628 of 8 bytes by task 487 on cpu 0:
+ test_kernel_write+0x1d/0x30
+ access_thread+0x89/0xd0
+ kthread+0x23e/0x260
+ ret_from_fork+0x22/0x30
+
+ read to 0xffffffffc009a628 of 8 bytes by task 488 on cpu 6:
+ test_kernel_read+0x10/0x20
+ access_thread+0x89/0xd0
+ kthread+0x23e/0x260
+ ret_from_fork+0x22/0x30
+
+ value changed: 0x00000000000009a6 -> 0x00000000000009b2
Reported by Kernel Concurrency Sanitizer on:
- CPU: 6 PID: 166 Comm: systemd-journal Not tainted 5.3.0-rc7+ #1
- Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
+ CPU: 6 PID: 488 Comm: access_thread Not tainted 5.12.0-rc2+ #1
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
==================================================================
The header of the report provides a short summary of the functions involved in
the race. It is followed by the access types and stack traces of the 2 threads
-involved in the data race.
+involved in the data race. If KCSAN also observed a value change, the observed
+old value and new value are shown on the "value changed" line respectively.
The other less common type of data race report looks like this::
==================================================================
- BUG: KCSAN: data-race in e1000_clean_rx_irq+0x551/0xb10
-
- race at unknown origin, with read to 0xffff933db8a2ae6c of 1 bytes by interrupt on cpu 0:
- e1000_clean_rx_irq+0x551/0xb10
- e1000_clean+0x533/0xda0
- net_rx_action+0x329/0x900
- __do_softirq+0xdb/0x2db
- irq_exit+0x9b/0xa0
- do_IRQ+0x9c/0xf0
- ret_from_intr+0x0/0x18
- default_idle+0x3f/0x220
- arch_cpu_idle+0x21/0x30
- do_idle+0x1df/0x230
- cpu_startup_entry+0x14/0x20
- rest_init+0xc5/0xcb
- arch_call_rest_init+0x13/0x2b
- start_kernel+0x6db/0x700
+ BUG: KCSAN: data-race in test_kernel_rmw_array+0x71/0xd0
+
+ race at unknown origin, with read to 0xffffffffc009bdb0 of 8 bytes by task 515 on cpu 2:
+ test_kernel_rmw_array+0x71/0xd0
+ access_thread+0x89/0xd0
+ kthread+0x23e/0x260
+ ret_from_fork+0x22/0x30
+
+ value changed: 0x0000000000002328 -> 0x0000000000002329
Reported by Kernel Concurrency Sanitizer on:
- CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-rc7+ #2
- Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
+ CPU: 2 PID: 515 Comm: access_thread Not tainted 5.12.0-rc2+ #1
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
==================================================================
This report is generated where it was not possible to determine the other
racing thread, but a race was inferred due to the data value of the watched
-memory location having changed. These can occur either due to missing
-instrumentation or e.g. DMA accesses. These reports will only be generated if
-``CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN=y`` (selected by default).
+memory location having changed. These reports always show a "value changed"
+line. A common reason for reports of this type are missing instrumentation in
+the racing thread, but could also occur due to e.g. DMA accesses. Such reports
+are shown only if ``CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN=y``, which is
+enabled by default.
Selective analysis
~~~~~~~~~~~~~~~~~~
@@ -106,7 +88,8 @@ the below options are available:
* KCSAN understands the ``data_race(expr)`` annotation, which tells KCSAN that
any data races due to accesses in ``expr`` should be ignored and resulting
- behaviour when encountering a data race is deemed safe.
+ behaviour when encountering a data race is deemed safe. Please see
+ `"Marking Shared-Memory Accesses" in the LKMM`_ for more information.
* Disabling data race detection for entire functions can be accomplished by
using the function attribute ``__no_kcsan``::
@@ -128,6 +111,8 @@ the below options are available:
KCSAN_SANITIZE := n
+.. _"Marking Shared-Memory Accesses" in the LKMM: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/memory-model/Documentation/access-marking.txt
+
Furthermore, it is possible to tell KCSAN to show or hide entire classes of
data races, depending on preferences. These can be changed via the following
Kconfig options:
diff --git a/Documentation/litmus-tests/locking/DCL-broken.litmus b/Documentation/litmus-tests/locking/DCL-broken.litmus
new file mode 100644
index 0000000000000..cfaa25ff82b1e
--- /dev/null
+++ b/Documentation/litmus-tests/locking/DCL-broken.litmus
@@ -0,0 +1,55 @@
+C DCL-broken
+
+(*
+ * Result: Sometimes
+ *
+ * This litmus test demonstrates more than just locking is required to
+ * correctly implement double-checked locking.
+ *)
+
+{
+ int flag;
+ int data;
+ int lck;
+}
+
+P0(int *flag, int *data, int *lck)
+{
+ int r0;
+ int r1;
+ int r2;
+
+ r0 = READ_ONCE(*flag);
+ if (r0 == 0) {
+ spin_lock(lck);
+ r1 = READ_ONCE(*flag);
+ if (r1 == 0) {
+ WRITE_ONCE(*data, 1);
+ WRITE_ONCE(*flag, 1);
+ }
+ spin_unlock(lck);
+ }
+ r2 = READ_ONCE(*data);
+}
+
+P1(int *flag, int *data, int *lck)
+{
+ int r0;
+ int r1;
+ int r2;
+
+ r0 = READ_ONCE(*flag);
+ if (r0 == 0) {
+ spin_lock(lck);
+ r1 = READ_ONCE(*flag);
+ if (r1 == 0) {
+ WRITE_ONCE(*data, 1);
+ WRITE_ONCE(*flag, 1);
+ }
+ spin_unlock(lck);
+ }
+ r2 = READ_ONCE(*data);
+}
+
+locations [flag;data;lck;0:r0;0:r1;1:r0;1:r1]
+exists (0:r2=0 \/ 1:r2=0)
diff --git a/Documentation/litmus-tests/locking/DCL-fixed.litmus b/Documentation/litmus-tests/locking/DCL-fixed.litmus
new file mode 100644
index 0000000000000..579d6c246f167
--- /dev/null
+++ b/Documentation/litmus-tests/locking/DCL-fixed.litmus
@@ -0,0 +1,56 @@
+C DCL-fixed
+
+(*
+ * Result: Never
+ *
+ * This litmus test demonstrates that double-checked locking can be
+ * reliable given proper use of smp_load_acquire() and smp_store_release()
+ * in addition to the locking.
+ *)
+
+{
+ int flag;
+ int data;
+ int lck;
+}
+
+P0(int *flag, int *data, int *lck)
+{
+ int r0;
+ int r1;
+ int r2;
+
+ r0 = smp_load_acquire(flag);
+ if (r0 == 0) {
+ spin_lock(lck);
+ r1 = READ_ONCE(*flag);
+ if (r1 == 0) {
+ WRITE_ONCE(*data, 1);
+ smp_store_release(flag, 1);
+ }
+ spin_unlock(lck);
+ }
+ r2 = READ_ONCE(*data);
+}
+
+P1(int *flag, int *data, int *lck)
+{
+ int r0;
+ int r1;
+ int r2;
+
+ r0 = smp_load_acquire(flag);
+ if (r0 == 0) {
+ spin_lock(lck);
+ r1 = READ_ONCE(*flag);
+ if (r1 == 0) {
+ WRITE_ONCE(*data, 1);
+ smp_store_release(flag, 1);
+ }
+ spin_unlock(lck);
+ }
+ r2 = READ_ONCE(*data);
+}
+
+locations [flag;data;lck;0:r0;0:r1;1:r0;1:r1]
+exists (0:r2=0 \/ 1:r2=0)
diff --git a/Documentation/litmus-tests/locking/RM-broken.litmus b/Documentation/litmus-tests/locking/RM-broken.litmus
new file mode 100644
index 0000000000000..c586ae4b547de
--- /dev/null
+++ b/Documentation/litmus-tests/locking/RM-broken.litmus
@@ -0,0 +1,42 @@
+C RM-broken
+
+(*
+ * Result: DEADLOCK
+ *
+ * This litmus test demonstrates that the old "roach motel" approach
+ * to locking, where code can be freely moved into critical sections,
+ * cannot be used in the Linux kernel.
+ *)
+
+{
+ int lck;
+ int x;
+ int y;
+}
+
+P0(int *x, int *y, int *lck)
+{
+ int r2;
+
+ spin_lock(lck);
+ r2 = atomic_inc_return(y);
+ WRITE_ONCE(*x, 1);
+ spin_unlock(lck);
+}
+
+P1(int *x, int *y, int *lck)
+{
+ int r0;
+ int r1;
+ int r2;
+
+ spin_lock(lck);
+ r0 = READ_ONCE(*x);
+ r1 = READ_ONCE(*x);
+ r2 = atomic_inc_return(y);
+ spin_unlock(lck);
+}
+
+locations [x;lck;0:r2;1:r0;1:r1;1:r2]
+filter (y=2 /\ 1:r0=0 /\ 1:r1=1)
+exists (1:r2=1)
diff --git a/Documentation/litmus-tests/locking/RM-fixed.litmus b/Documentation/litmus-tests/locking/RM-fixed.litmus
new file mode 100644
index 0000000000000..672856736b42e
--- /dev/null
+++ b/Documentation/litmus-tests/locking/RM-fixed.litmus
@@ -0,0 +1,42 @@
+C RM-fixed
+
+(*
+ * Result: Never
+ *
+ * This litmus test demonstrates that the old "roach motel" approach
+ * to locking, where code can be freely moved into critical sections,
+ * cannot be used in the Linux kernel.
+ *)
+
+{
+ int lck;
+ int x;
+ int y;
+}
+
+P0(int *x, int *y, int *lck)
+{
+ int r2;
+
+ spin_lock(lck);
+ r2 = atomic_inc_return(y);
+ WRITE_ONCE(*x, 1);
+ spin_unlock(lck);
+}
+
+P1(int *x, int *y, int *lck)
+{
+ int r0;
+ int r1;
+ int r2;
+
+ r0 = READ_ONCE(*x);
+ r1 = READ_ONCE(*x);
+ spin_lock(lck);
+ r2 = atomic_inc_return(y);
+ spin_unlock(lck);
+}
+
+locations [x;lck;0:r2;1:r0;1:r1;1:r2]
+filter (y=2 /\ 1:r0=0 /\ 1:r1=1)
+exists (1:r2=1)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57ec011921805..2e076a459a0c0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1128,6 +1128,7 @@ static int tsc_cs_enable(struct clocksource *cs)
static struct clocksource clocksource_tsc_early = {
.name = "tsc-early",
.rating = 299,
+ .uncertainty_margin = 32 * NSEC_PER_MSEC,
.read = read_tsc,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
@@ -1152,7 +1153,8 @@ static struct clocksource clocksource_tsc = {
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_VALID_FOR_HRES |
- CLOCK_SOURCE_MUST_VERIFY,
+ CLOCK_SOURCE_MUST_VERIFY |
+ CLOCK_SOURCE_VERIFY_PERCPU,
.vdso_clock_mode = VDSO_CLOCKMODE_TSC,
.enable = tsc_cs_enable,
.resume = tsc_resume,
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index d6ab416ee2d2c..1d42d4b173271 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -43,6 +43,8 @@ struct module;
* @shift: Cycle to nanosecond divisor (power of two)
* @max_idle_ns: Maximum idle time permitted by the clocksource (nsecs)
* @maxadj: Maximum adjustment value to mult (~11%)
+ * @uncertainty_margin: Maximum uncertainty in nanoseconds per half second.
+ * Zero says to use default WATCHDOG_THRESHOLD.
* @archdata: Optional arch-specific data
* @max_cycles: Maximum safe cycle value which won't overflow on
* multiplication
@@ -98,6 +100,7 @@ struct clocksource {
u32 shift;
u64 max_idle_ns;
u32 maxadj;
+ u32 uncertainty_margin;
#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
struct arch_clocksource_data archdata;
#endif
@@ -137,7 +140,7 @@ struct clocksource {
#define CLOCK_SOURCE_UNSTABLE 0x40
#define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80
#define CLOCK_SOURCE_RESELECT 0x100
-
+#define CLOCK_SOURCE_VERIFY_PERCPU 0x200
/* simplify initialization of mask field */
#define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0)
@@ -288,4 +291,7 @@ static inline void timer_probe(void) {}
#define TIMER_ACPI_DECLARE(name, table_id, fn) \
ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn)
+extern ulong max_cswd_read_retries;
+void clocksource_verify_percpu(struct clocksource *cs);
+
#endif /* _LINUX_CLOCKSOURCE_H */
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index f8633d37e3581..d29740be4833e 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -11,15 +11,6 @@
#include <linux/rcupdate.h>
/*
- * Why is there no list_empty_rcu()? Because list_empty() serves this
- * purpose. The list_empty() function fetches the RCU-protected pointer
- * and compares it to the address of the list head, but neither dereferences
- * this pointer itself nor provides this pointer to the caller. Therefore,
- * it is not necessary to use rcu_dereference(), so that list_empty() can
- * be used anywhere you would want to use a list_empty_rcu().
- */
-
-/*
* INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers
* @list: list to be initialized
*
@@ -318,21 +309,29 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
/*
* Where are list_empty_rcu() and list_first_entry_rcu()?
*
- * Implementing those functions following their counterparts list_empty() and
- * list_first_entry() is not advisable because they lead to subtle race
- * conditions as the following snippet shows:
+ * They do not exist because they would lead to subtle race conditions:
*
* if (!list_empty_rcu(mylist)) {
* struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
* do_something(bar);
* }
*
- * The list may not be empty when list_empty_rcu checks it, but it may be when
- * list_first_entry_rcu rereads the ->next pointer.
- *
- * Rereading the ->next pointer is not a problem for list_empty() and
- * list_first_entry() because they would be protected by a lock that blocks
- * writers.
+ * The list might be non-empty when list_empty_rcu() checks it, but it
+ * might have become empty by the time that list_first_entry_rcu() rereads
+ * the ->next pointer, which would result in a SEGV.
+ *
+ * When not using RCU, it is OK for list_first_entry() to re-read that
+ * pointer because both functions should be protected by some lock that
+ * blocks writers.
+ *
+ * When using RCU, list_empty() uses READ_ONCE() to fetch the
+ * RCU-protected ->next pointer and then compares it to the address of the
+ * list head. However, it neither dereferences this pointer nor provides
+ * this pointer to its caller. Thus, READ_ONCE() suffices (that is,
+ * rcu_dereference() is not needed), which means that list_empty() can be
+ * used anywhere you would want to use list_empty_rcu(). Just don't
+ * expect anything useful to happen if you do a subsequent lockless
+ * call to list_first_entry_rcu()!!!
*
* See list_first_or_null_rcu for an alternative.
*/
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 9455476c5ba22..cfeb43bfc719e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -53,7 +53,7 @@ void __rcu_read_unlock(void);
* nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
* types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
*/
-#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
+#define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting)
#else /* #ifdef CONFIG_PREEMPT_RCU */
@@ -315,7 +315,7 @@ static inline int rcu_read_lock_any_held(void)
#define RCU_LOCKDEP_WARN(c, s) \
do { \
static bool __section(".data.unlikely") __warned; \
- if (debug_lockdep_rcu_enabled() && !__warned && (c)) { \
+ if ((c) && debug_lockdep_rcu_enabled() && !__warned) { \
__warned = true; \
lockdep_rcu_suspicious(__FILE__, __LINE__, s); \
} \
@@ -363,6 +363,20 @@ static inline void rcu_preempt_sleep_check(void) { }
#define rcu_check_sparse(p, space)
#endif /* #else #ifdef __CHECKER__ */
+/**
+ * unrcu_pointer - mark a pointer as not being RCU protected
+ * @p: pointer needing to lose its __rcu property
+ *
+ * Converts @p from an __rcu pointer to a __kernel pointer.
+ * This allows an __rcu pointer to be used with xchg() and friends.
+ */
+#define unrcu_pointer(p) \
+({ \
+ typeof(*p) *_________p1 = (typeof(*p) *__force)(p); \
+ rcu_check_sparse(p, __rcu); \
+ ((typeof(*p) __force __kernel *)(_________p1)); \
+})
+
#define __rcu_access_pointer(p, space) \
({ \
typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
@@ -518,7 +532,12 @@ do { \
* @p: The pointer to read, prior to dereferencing
* @c: The conditions under which the dereference will take place
*
- * This is the RCU-bh counterpart to rcu_dereference_check().
+ * This is the RCU-bh counterpart to rcu_dereference_check(). However,
+ * please note that starting in v5.0 kernels, vanilla RCU grace periods
+ * wait for local_bh_disable() regions of code in addition to regions of
+ * code demarked by rcu_read_lock() and rcu_read_unlock(). This means
+ * that synchronize_rcu(), call_rcu, and friends all take not only
+ * rcu_read_lock() but also rcu_read_lock_bh() into account.
*/
#define rcu_dereference_bh_check(p, c) \
__rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)
@@ -529,6 +548,11 @@ do { \
* @c: The conditions under which the dereference will take place
*
* This is the RCU-sched counterpart to rcu_dereference_check().
+ * However, please note that starting in v5.0 kernels, vanilla RCU grace
+ * periods wait for preempt_disable() regions of code in addition to
+ * regions of code demarked by rcu_read_lock() and rcu_read_unlock().
+ * This means that synchronize_rcu(), call_rcu, and friends all take not
+ * only rcu_read_lock() but also rcu_read_lock_sched() into account.
*/
#define rcu_dereference_sched_check(p, c) \
__rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
@@ -620,6 +644,12 @@ do { \
* sections, invocation of the corresponding RCU callback is deferred
* until after the all the other CPUs exit their critical sections.
*
+ * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also
+ * wait for regions of code with preemption disabled, including regions of
+ * code with interrupts or softirqs disabled. In pre-v5.0 kernels, which
+ * define synchronize_sched(), only code enclosed within rcu_read_lock()
+ * and rcu_read_unlock() are guaranteed to be waited for.
+ *
* Note, however, that RCU callbacks are permitted to run concurrently
* with new RCU read-side critical sections. One way that this can happen
* is via the following sequence of events: (1) CPU 0 enters an RCU
@@ -672,33 +702,12 @@ static __always_inline void rcu_read_lock(void)
/**
* rcu_read_unlock() - marks the end of an RCU read-side critical section.
*
- * In most situations, rcu_read_unlock() is immune from deadlock.
- * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock()
- * is responsible for deboosting, which it does via rt_mutex_unlock().
- * Unfortunately, this function acquires the scheduler's runqueue and
- * priority-inheritance spinlocks. This means that deadlock could result
- * if the caller of rcu_read_unlock() already holds one of these locks or
- * any lock that is ever acquired while holding them.
- *
- * That said, RCU readers are never priority boosted unless they were
- * preempted. Therefore, one way to avoid deadlock is to make sure
- * that preemption never happens within any RCU read-side critical
- * section whose outermost rcu_read_unlock() is called with one of
- * rt_mutex_unlock()'s locks held. Such preemption can be avoided in
- * a number of ways, for example, by invoking preempt_disable() before
- * critical section's outermost rcu_read_lock().
- *
- * Given that the set of locks acquired by rt_mutex_unlock() might change
- * at any time, a somewhat more future-proofed approach is to make sure
- * that that preemption never happens within any RCU read-side critical
- * section whose outermost rcu_read_unlock() is called with irqs disabled.
- * This approach relies on the fact that rt_mutex_unlock() currently only
- * acquires irq-disabled locks.
- *
- * The second of these two approaches is best in most situations,
- * however, the first approach can also be useful, at least to those
- * developers willing to keep abreast of the set of locks acquired by
- * rt_mutex_unlock().
+ * In almost all situations, rcu_read_unlock() is immune from deadlock.
+ * In recent kernels that have consolidated synchronize_sched() and
+ * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity
+ * also extends to the scheduler's runqueue and priority-inheritance
+ * spinlocks, courtesy of the quiescent-state deferral that is carried
+ * out when rcu_read_unlock() is invoked with interrupts disabled.
*
* See rcu_read_lock() for more information.
*/
@@ -714,9 +723,11 @@ static inline void rcu_read_unlock(void)
/**
* rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
*
- * This is equivalent of rcu_read_lock(), but also disables softirqs.
- * Note that anything else that disables softirqs can also serve as
- * an RCU read-side critical section.
+ * This is equivalent to rcu_read_lock(), but also disables softirqs.
+ * Note that anything else that disables softirqs can also serve as an RCU
+ * read-side critical section. However, please note that this equivalence
+ * applies only to v5.0 and later. Before v5.0, rcu_read_lock() and
+ * rcu_read_lock_bh() were unrelated.
*
* Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
* must occur in the same context, for example, it is illegal to invoke
@@ -749,9 +760,12 @@ static inline void rcu_read_unlock_bh(void)
/**
* rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
*
- * This is equivalent of rcu_read_lock(), but disables preemption.
- * Read-side critical sections can also be introduced by anything else
- * that disables preemption, including local_irq_disable() and friends.
+ * This is equivalent to rcu_read_lock(), but also disables preemption.
+ * Read-side critical sections can also be introduced by anything else that
+ * disables preemption, including local_irq_disable() and friends. However,
+ * please note that the equivalence to rcu_read_lock() applies only to
+ * v5.0 and later. Before v5.0, rcu_read_lock() and rcu_read_lock_sched()
+ * were unrelated.
*
* Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
* must occur in the same context, for example, it is illegal to invoke
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 35e0be326ffc7..9be015305f9f9 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -14,9 +14,6 @@
#include <asm/param.h> /* for HZ */
-/* Never flag non-existent other CPUs! */
-static inline bool rcu_eqs_special_set(int cpu) { return false; }
-
unsigned long get_state_synchronize_rcu(void);
unsigned long start_poll_synchronize_rcu(void);
bool poll_state_synchronize_rcu(unsigned long oldstate);
@@ -86,7 +83,6 @@ static inline void rcu_irq_enter(void) { }
static inline void rcu_irq_exit_irqson(void) { }
static inline void rcu_irq_enter_irqson(void) { }
static inline void rcu_irq_exit(void) { }
-static inline void rcu_irq_exit_preempt(void) { }
static inline void rcu_irq_exit_check_preempt(void) { }
#define rcu_is_idle_cpu(cpu) \
(is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq())
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index b89b54130f496..53209d6694001 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -49,7 +49,6 @@ void rcu_idle_enter(void);
void rcu_idle_exit(void);
void rcu_irq_enter(void);
void rcu_irq_exit(void);
-void rcu_irq_exit_preempt(void);
void rcu_irq_enter_irqson(void);
void rcu_irq_exit_irqson(void);
bool rcu_is_idle_cpu(int cpu);
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index a0895bbf71ce0..e6011a9975af2 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -64,6 +64,12 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);
+#ifdef CONFIG_SRCU
+void srcu_init(void);
+#else /* #ifdef CONFIG_SRCU */
+static inline void srcu_init(void) { }
+#endif /* #else #ifdef CONFIG_SRCU */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 0e0cf4d6a72a0..6cfaa0a9a9b96 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -61,7 +61,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp)
int idx;
idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
- WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1);
+ WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1);
return idx;
}
@@ -81,11 +81,11 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp,
{
int idx;
- idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
+ idx = ((data_race(READ_ONCE(ssp->srcu_idx)) + 1) & 0x2) >> 1;
pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
tt, tf, idx,
- READ_ONCE(ssp->srcu_lock_nesting[!idx]),
- READ_ONCE(ssp->srcu_lock_nesting[idx]));
+ data_race(READ_ONCE(ssp->srcu_lock_nesting[!idx])),
+ data_race(READ_ONCE(ssp->srcu_lock_nesting[idx])));
}
#endif
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 9cfcc8a756ae3..cb1f4351e8baa 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -82,9 +82,7 @@ struct srcu_struct {
/* callback for the barrier */
/* operation. */
struct delayed_work work;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
};
/* Values for state variable (bottom bits of ->srcu_gp_seq). */
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 4118a97e62fb4..fda13c9d1256c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -192,8 +192,6 @@ extern int try_to_del_timer_sync(struct timer_list *timer);
#define del_singleshot_timer_sync(t) del_timer_sync(t)
-extern bool timer_curr_running(struct timer_list *timer);
-
extern void init_timers(void);
struct hrtimer;
extern enum hrtimer_restart it_real_fn(struct hrtimer *);
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 6768b64bc738b..670e41783edd8 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -278,6 +278,7 @@ TRACE_EVENT_RCU(rcu_exp_funnel_lock,
* "WakeNot": Don't wake rcuo kthread.
* "WakeNotPoll": Don't wake rcuo kthread because it is polling.
* "WakeOvfIsDeferred": Wake rcuo kthread later, CB list is huge.
+ * "WakeBypassIsDeferred": Wake rcuo kthread later, bypass list is contended.
* "WokeEmpty": rcuo CB kthread woke to find empty list.
*/
TRACE_EVENT_RCU(rcu_nocb_wake,
diff --git a/init/main.c b/init/main.c
index 359358500e544..08438263a4b8c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -42,6 +42,7 @@
#include <linux/profile.h>
#include <linux/kfence.h>
#include <linux/rcupdate.h>
+#include <linux/srcu.h>
#include <linux/moduleparam.h>
#include <linux/kallsyms.h>
#include <linux/writeback.h>
@@ -976,6 +977,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
tick_init();
rcu_init_nohz();
init_timers();
+ srcu_init();
hrtimers_init();
softirq_init();
timekeeping_init();
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 45c821d4e8bd1..26709ea65c715 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -380,9 +380,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
if (consumed) {
kcsan_save_irqtrace(current);
- kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE,
- KCSAN_REPORT_CONSUMED_WATCHPOINT,
- watchpoint - watchpoints);
+ kcsan_report_set_info(ptr, size, type, watchpoint - watchpoints);
kcsan_restore_irqtrace(current);
} else {
/*
@@ -407,12 +405,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
atomic_long_t *watchpoint;
- union {
- u8 _1;
- u16 _2;
- u32 _4;
- u64 _8;
- } expect_value;
+ u64 old, new, diff;
unsigned long access_mask;
enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
unsigned long ua_flags = user_access_save();
@@ -468,19 +461,19 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Read the current value, to later check and infer a race if the data
* was modified via a non-instrumented access, e.g. from a device.
*/
- expect_value._8 = 0;
+ old = 0;
switch (size) {
case 1:
- expect_value._1 = READ_ONCE(*(const u8 *)ptr);
+ old = READ_ONCE(*(const u8 *)ptr);
break;
case 2:
- expect_value._2 = READ_ONCE(*(const u16 *)ptr);
+ old = READ_ONCE(*(const u16 *)ptr);
break;
case 4:
- expect_value._4 = READ_ONCE(*(const u32 *)ptr);
+ old = READ_ONCE(*(const u32 *)ptr);
break;
case 8:
- expect_value._8 = READ_ONCE(*(const u64 *)ptr);
+ old = READ_ONCE(*(const u64 *)ptr);
break;
default:
break; /* ignore; we do not diff the values */
@@ -506,33 +499,30 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* racy access.
*/
access_mask = get_ctx()->access_mask;
+ new = 0;
switch (size) {
case 1:
- expect_value._1 ^= READ_ONCE(*(const u8 *)ptr);
- if (access_mask)
- expect_value._1 &= (u8)access_mask;
+ new = READ_ONCE(*(const u8 *)ptr);
break;
case 2:
- expect_value._2 ^= READ_ONCE(*(const u16 *)ptr);
- if (access_mask)
- expect_value._2 &= (u16)access_mask;
+ new = READ_ONCE(*(const u16 *)ptr);
break;
case 4:
- expect_value._4 ^= READ_ONCE(*(const u32 *)ptr);
- if (access_mask)
- expect_value._4 &= (u32)access_mask;
+ new = READ_ONCE(*(const u32 *)ptr);
break;
case 8:
- expect_value._8 ^= READ_ONCE(*(const u64 *)ptr);
- if (access_mask)
- expect_value._8 &= (u64)access_mask;
+ new = READ_ONCE(*(const u64 *)ptr);
break;
default:
break; /* ignore; we do not diff the values */
}
+ diff = old ^ new;
+ if (access_mask)
+ diff &= access_mask;
+
/* Were we able to observe a value-change? */
- if (expect_value._8 != 0)
+ if (diff != 0)
value_change = KCSAN_VALUE_CHANGE_TRUE;
/* Check if this access raced with another. */
@@ -566,8 +556,9 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
- kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL,
- watchpoint - watchpoints);
+ kcsan_report_known_origin(ptr, size, type, value_change,
+ watchpoint - watchpoints,
+ old, new, access_mask);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
@@ -576,9 +567,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
- kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE,
- KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
- watchpoint - watchpoints);
+ kcsan_report_unknown_origin(ptr, size, type, old, new, access_mask);
}
/*
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 9881099d41791..f36e25c497ed8 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -116,30 +116,27 @@ enum kcsan_value_change {
KCSAN_VALUE_CHANGE_TRUE,
};
-enum kcsan_report_type {
- /*
- * The thread that set up the watchpoint and briefly stalled was
- * signalled that another thread triggered the watchpoint.
- */
- KCSAN_REPORT_RACE_SIGNAL,
-
- /*
- * A thread found and consumed a matching watchpoint.
- */
- KCSAN_REPORT_CONSUMED_WATCHPOINT,
+/*
+ * The calling thread hit and consumed a watchpoint: set the access information
+ * to be consumed by the reporting thread. No report is printed yet.
+ */
+void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
+ int watchpoint_idx);
- /*
- * No other thread was observed to race with the access, but the data
- * value before and after the stall differs.
- */
- KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
-};
+/*
+ * The calling thread observed that the watchpoint it set up was hit and
+ * consumed: print the full report based on information set by the racing
+ * thread.
+ */
+void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
+ enum kcsan_value_change value_change, int watchpoint_idx,
+ u64 old, u64 new, u64 mask);
/*
- * Print a race report from thread that encountered the race.
+ * No other thread was observed to race with the access, but the data value
+ * before and after the stall differs. Reports a race of "unknown origin".
*/
-extern void kcsan_report(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change,
- enum kcsan_report_type type, int watchpoint_idx);
+void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
+ u64 old, u64 new, u64 mask);
#endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 13dce3c664d63..e37e4386f86dc 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -325,13 +325,10 @@ static void print_verbose_info(struct task_struct *task)
print_irqtrace_events(task);
}
-/*
- * Returns true if a report was generated, false otherwise.
- */
-static bool print_report(enum kcsan_value_change value_change,
- enum kcsan_report_type type,
+static void print_report(enum kcsan_value_change value_change,
const struct access_info *ai,
- const struct other_info *other_info)
+ const struct other_info *other_info,
+ u64 old, u64 new, u64 mask)
{
unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
@@ -344,25 +341,24 @@ static bool print_report(enum kcsan_value_change value_change,
* Must check report filter rules before starting to print.
*/
if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr]))
- return false;
+ return;
- if (type == KCSAN_REPORT_RACE_SIGNAL) {
+ if (other_info) {
other_skipnr = get_stack_skipnr(other_info->stack_entries,
other_info->num_stack_entries);
other_frame = other_info->stack_entries[other_skipnr];
/* @value_change is only known for the other thread */
if (skip_report(value_change, other_frame))
- return false;
+ return;
}
if (rate_limit_report(this_frame, other_frame))
- return false;
+ return;
/* Print report header. */
pr_err("==================================================================\n");
- switch (type) {
- case KCSAN_REPORT_RACE_SIGNAL: {
+ if (other_info) {
int cmp;
/*
@@ -374,22 +370,15 @@ static bool print_report(enum kcsan_value_change value_change,
get_bug_type(ai->access_type | other_info->ai.access_type),
(void *)(cmp < 0 ? other_frame : this_frame),
(void *)(cmp < 0 ? this_frame : other_frame));
- } break;
-
- case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+ } else {
pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(ai->access_type),
(void *)this_frame);
- break;
-
- default:
- BUG();
}
pr_err("\n");
/* Print information about the racing accesses. */
- switch (type) {
- case KCSAN_REPORT_RACE_SIGNAL:
+ if (other_info) {
pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
get_access_type(other_info->ai.access_type), other_info->ai.ptr,
other_info->ai.size, get_thread_desc(other_info->ai.task_pid),
@@ -407,16 +396,10 @@ static bool print_report(enum kcsan_value_change value_change,
pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
get_access_type(ai->access_type), ai->ptr, ai->size,
get_thread_desc(ai->task_pid), ai->cpu_id);
- break;
-
- case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+ } else {
pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n",
get_access_type(ai->access_type), ai->ptr, ai->size,
get_thread_desc(ai->task_pid), ai->cpu_id);
- break;
-
- default:
- BUG();
}
/* Print stack trace of this thread. */
stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
@@ -425,24 +408,41 @@ static bool print_report(enum kcsan_value_change value_change,
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(current);
+ /* Print observed value change. */
+ if (ai->size <= 8) {
+ int hex_len = ai->size * 2;
+ u64 diff = old ^ new;
+
+ if (mask)
+ diff &= mask;
+ if (diff) {
+ pr_err("\n");
+ pr_err("value changed: 0x%0*llx -> 0x%0*llx\n",
+ hex_len, old, hex_len, new);
+ if (mask) {
+ pr_err(" bits changed: 0x%0*llx with mask 0x%0*llx\n",
+ hex_len, diff, hex_len, mask);
+ }
+ }
+ }
+
/* Print report footer. */
pr_err("\n");
pr_err("Reported by Kernel Concurrency Sanitizer on:\n");
dump_stack_print_info(KERN_DEFAULT);
pr_err("==================================================================\n");
- return true;
+ if (panic_on_warn)
+ panic("panic_on_warn set ...\n");
}
static void release_report(unsigned long *flags, struct other_info *other_info)
{
- if (other_info)
- /*
- * Use size to denote valid/invalid, since KCSAN entirely
- * ignores 0-sized accesses.
- */
- other_info->ai.size = 0;
-
+ /*
+ * Use size to denote valid/invalid, since KCSAN entirely ignores
+ * 0-sized accesses.
+ */
+ other_info->ai.size = 0;
raw_spin_unlock_irqrestore(&report_lock, *flags);
}
@@ -575,48 +575,42 @@ discard:
return false;
}
-/*
- * Depending on the report type either sets @other_info and returns false, or
- * awaits @other_info and returns true. If @other_info is not required for the
- * report type, simply acquires @report_lock and returns true.
- */
-static noinline bool prepare_report(unsigned long *flags,
- enum kcsan_report_type type,
- const struct access_info *ai,
- struct other_info *other_info)
+static struct access_info prepare_access_info(const volatile void *ptr, size_t size,
+ int access_type)
{
- switch (type) {
- case KCSAN_REPORT_CONSUMED_WATCHPOINT:
- prepare_report_producer(flags, ai, other_info);
- return false;
- case KCSAN_REPORT_RACE_SIGNAL:
- return prepare_report_consumer(flags, ai, other_info);
- default:
- /* @other_info not required; just acquire @report_lock. */
- raw_spin_lock_irqsave(&report_lock, *flags);
- return true;
- }
-}
-
-void kcsan_report(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change,
- enum kcsan_report_type type, int watchpoint_idx)
-{
- unsigned long flags = 0;
- const struct access_info ai = {
+ return (struct access_info) {
.ptr = ptr,
.size = size,
.access_type = access_type,
.task_pid = in_task() ? task_pid_nr(current) : -1,
.cpu_id = raw_smp_processor_id()
};
- struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN
- ? NULL : &other_infos[watchpoint_idx];
+}
+
+void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
+ int watchpoint_idx)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ unsigned long flags;
kcsan_disable_current();
- if (WARN_ON(watchpoint_idx < 0 || watchpoint_idx >= ARRAY_SIZE(other_infos)))
- goto out;
+ lockdep_off(); /* See kcsan_report_known_origin(). */
+
+ prepare_report_producer(&flags, &ai, &other_infos[watchpoint_idx]);
+
+ lockdep_on();
+ kcsan_enable_current();
+}
+
+void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
+ enum kcsan_value_change value_change, int watchpoint_idx,
+ u64 old, u64 new, u64 mask)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ struct other_info *other_info = &other_infos[watchpoint_idx];
+ unsigned long flags = 0;
+ kcsan_disable_current();
/*
* Because we may generate reports when we're in scheduler code, the use
* of printk() could deadlock. Until such time that all printing code
@@ -626,22 +620,35 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type,
*/
lockdep_off();
- if (prepare_report(&flags, type, &ai, other_info)) {
- /*
- * Never report if value_change is FALSE, only if we it is
- * either TRUE or MAYBE. In case of MAYBE, further filtering may
- * be done once we know the full stack trace in print_report().
- */
- bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE &&
- print_report(value_change, type, &ai, other_info);
+ if (!prepare_report_consumer(&flags, &ai, other_info))
+ goto out;
+ /*
+ * Never report if value_change is FALSE, only when it is
+ * either TRUE or MAYBE. In case of MAYBE, further filtering may
+ * be done once we know the full stack trace in print_report().
+ */
+ if (value_change != KCSAN_VALUE_CHANGE_FALSE)
+ print_report(value_change, &ai, other_info, old, new, mask);
- if (reported && panic_on_warn)
- panic("panic_on_warn set ...\n");
+ release_report(&flags, other_info);
+out:
+ lockdep_on();
+ kcsan_enable_current();
+}
- release_report(&flags, other_info);
- }
+void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
+ u64 old, u64 new, u64 mask)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ unsigned long flags;
+
+ kcsan_disable_current();
+ lockdep_off(); /* See kcsan_report_known_origin(). */
+
+ raw_spin_lock_irqsave(&report_lock, flags);
+ print_report(KCSAN_VALUE_CHANGE_TRUE, &ai, NULL, old, new, mask);
+ raw_spin_unlock_irqrestore(&report_lock, flags);
lockdep_on();
-out:
kcsan_enable_current();
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7641bd4072390..cb8af6a3e1540 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -6393,6 +6393,7 @@ asmlinkage __visible void lockdep_sys_exit(void)
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
+ int dl = READ_ONCE(debug_locks);
/* Note: the following can be executed concurrently, so be careful. */
pr_warn("\n");
@@ -6402,11 +6403,12 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
pr_warn("-----------------------------\n");
pr_warn("%s:%d %s!\n", file, line, s);
pr_warn("\nother info that might help us debug this:\n\n");
- pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
: "",
- rcu_scheduler_active, debug_locks);
+ rcu_scheduler_active, dl,
+ dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n");
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index b3adb40549bf3..7c5a4a087cc73 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -59,7 +59,7 @@ static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
static bool lock_is_write_held;
-static bool lock_is_read_held;
+static atomic_t lock_is_read_held;
static unsigned long last_lock_release;
struct lock_stress_stats {
@@ -682,7 +682,7 @@ static int lock_torture_writer(void *arg)
if (WARN_ON_ONCE(lock_is_write_held))
lwsp->n_lock_fail++;
lock_is_write_held = true;
- if (WARN_ON_ONCE(lock_is_read_held))
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
lwsp->n_lock_fail++; /* rare, but... */
lwsp->n_lock_acquired++;
@@ -717,13 +717,13 @@ static int lock_torture_reader(void *arg)
schedule_timeout_uninterruptible(1);
cxt.cur_ops->readlock(tid);
- lock_is_read_held = true;
+ atomic_inc(&lock_is_read_held);
if (WARN_ON_ONCE(lock_is_write_held))
lrsp->n_lock_fail++; /* rare, but... */
lrsp->n_lock_acquired++;
cxt.cur_ops->read_delay(&rand);
- lock_is_read_held = false;
+ atomic_dec(&lock_is_read_held);
cxt.cur_ops->readunlock(tid);
stutter_wait("lock_torture_reader");
@@ -738,20 +738,22 @@ static int lock_torture_reader(void *arg)
static void __torture_print_stats(char *page,
struct lock_stress_stats *statp, bool write)
{
+ long cur;
bool fail = false;
int i, n_stress;
- long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
+ long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0;
long long sum = 0;
n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
for (i = 0; i < n_stress; i++) {
- if (statp[i].n_lock_fail)
+ if (data_race(statp[i].n_lock_fail))
fail = true;
- sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_acquired)
- max = statp[i].n_lock_acquired;
- if (min > statp[i].n_lock_acquired)
- min = statp[i].n_lock_acquired;
+ cur = data_race(statp[i].n_lock_acquired);
+ sum += cur;
+ if (max < cur)
+ max = cur;
+ if (min > cur)
+ min = cur;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
@@ -996,7 +998,6 @@ static int __init lock_torture_init(void)
}
if (nreaders_stress) {
- lock_is_read_held = false;
cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
sizeof(*cxt.lrsa),
GFP_KERNEL);
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 1942c1f1bb65d..4fd64999300fc 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -116,7 +116,7 @@ config RCU_EQS_DEBUG
config RCU_STRICT_GRACE_PERIOD
bool "Provide debug RCU implementation with short grace periods"
- depends on DEBUG_KERNEL && RCU_EXPERT
+ depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4
default n
select PREEMPT_COUNT if PREEMPT=n
help
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bf0827d4b6593..24b5f2c2de87b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -308,6 +308,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
}
}
+extern void rcu_init_geometry(void);
+
/* Returns a pointer to the first leaf rcu_node structure. */
#define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1])
@@ -422,12 +424,6 @@ do { \
#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
-#ifdef CONFIG_SRCU
-void srcu_init(void);
-#else /* #ifdef CONFIG_SRCU */
-static inline void srcu_init(void) { }
-#endif /* #else #ifdef CONFIG_SRCU */
-
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
static inline bool rcu_gp_is_normal(void) { return true; }
@@ -441,7 +437,11 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
void rcupdate_announce_bootup_oddness(void);
+#ifdef CONFIG_TASKS_RCU_GENERIC
void show_rcu_tasks_gp_kthreads(void);
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void show_rcu_tasks_gp_kthreads(void) {}
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
void rcu_request_urgent_qs_task(struct task_struct *t);
#endif /* #else #ifdef CONFIG_TINY_RCU */
@@ -519,6 +519,7 @@ static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
static inline unsigned long
srcu_batches_completed(struct srcu_struct *sp) { return 0; }
static inline void rcu_force_quiescent_state(void) { }
+static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
static inline void show_rcu_gp_kthreads(void) { }
static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
static inline void rcu_fwd_progress_check(unsigned long j) { }
@@ -527,6 +528,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
unsigned long srcu_batches_completed(struct srcu_struct *sp);
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
void show_rcu_gp_kthreads(void);
int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 29d2f4c647d3a..70c7e265bb226 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -245,12 +245,6 @@ static const char *rcu_torture_writer_state_getname(void)
return rcu_torture_writer_state_names[i];
}
-#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_PREEMPT_RT)
-# define rcu_can_boost() 1
-#else
-# define rcu_can_boost() 0
-#endif
-
#ifdef CONFIG_RCU_TRACE
static u64 notrace rcu_trace_clock_local(void)
{
@@ -331,6 +325,7 @@ struct rcu_torture_ops {
void (*read_delay)(struct torture_random_state *rrsp,
struct rt_read_seg *rtrsp);
void (*readunlock)(int idx);
+ int (*readlock_held)(void);
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
@@ -345,6 +340,7 @@ struct rcu_torture_ops {
void (*fqs)(void);
void (*stats)(void);
void (*gp_kthread_dbg)(void);
+ bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
int irq_capable;
int can_boost;
@@ -359,6 +355,11 @@ static struct rcu_torture_ops *cur_ops;
* Definitions for rcu torture testing.
*/
+static int torture_readlock_not_held(void)
+{
+ return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
+}
+
static int rcu_torture_read_lock(void) __acquires(RCU)
{
rcu_read_lock();
@@ -483,30 +484,32 @@ static void rcu_sync_torture_init(void)
}
static struct rcu_torture_ops rcu_ops = {
- .ttype = RCU_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .get_gp_seq = rcu_get_gp_seq,
- .gp_diff = rcu_seq_diff,
- .deferred_free = rcu_torture_deferred_free,
- .sync = synchronize_rcu,
- .exp_sync = synchronize_rcu_expedited,
- .get_gp_state = get_state_synchronize_rcu,
- .start_gp_poll = start_poll_synchronize_rcu,
- .poll_gp_state = poll_state_synchronize_rcu,
- .cond_sync = cond_synchronize_rcu,
- .call = call_rcu,
- .cb_barrier = rcu_barrier,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .gp_kthread_dbg = show_rcu_gp_kthreads,
- .stall_dur = rcu_jiffies_till_stall_check,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .extendables = RCUTORTURE_MAX_EXTEND,
- .name = "rcu"
+ .ttype = RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_get_gp_seq,
+ .gp_diff = rcu_seq_diff,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .get_gp_state = get_state_synchronize_rcu,
+ .start_gp_poll = start_poll_synchronize_rcu,
+ .poll_gp_state = poll_state_synchronize_rcu,
+ .cond_sync = cond_synchronize_rcu,
+ .call = call_rcu,
+ .cb_barrier = rcu_barrier,
+ .fqs = rcu_force_quiescent_state,
+ .stats = NULL,
+ .gp_kthread_dbg = show_rcu_gp_kthreads,
+ .check_boost_failed = rcu_check_boost_fail,
+ .stall_dur = rcu_jiffies_till_stall_check,
+ .irq_capable = 1,
+ .can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
+ .extendables = RCUTORTURE_MAX_EXTEND,
+ .name = "rcu"
};
/*
@@ -540,6 +543,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_busted_torture_deferred_free,
.sync = synchronize_rcu_busted,
@@ -589,6 +593,11 @@ static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
srcu_read_unlock(srcu_ctlp, idx);
}
+static int torture_srcu_read_lock_held(void)
+{
+ return srcu_read_lock_held(srcu_ctlp);
+}
+
static unsigned long srcu_torture_completed(void)
{
return srcu_batches_completed(srcu_ctlp);
@@ -646,6 +655,7 @@ static struct rcu_torture_ops srcu_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -681,6 +691,7 @@ static struct rcu_torture_ops srcud_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -700,6 +711,7 @@ static struct rcu_torture_ops busted_srcud_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -787,6 +799,7 @@ static struct rcu_torture_ops trivial_ops = {
.readlock = rcu_torture_read_lock_trivial,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock_trivial,
+ .readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_no_completed,
.sync = synchronize_rcu_trivial,
.exp_sync = synchronize_rcu_trivial,
@@ -850,6 +863,7 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.readlock = tasks_tracing_torture_read_lock,
.read_delay = srcu_read_delay, /* just reuse srcu's version. */
.readunlock = tasks_tracing_torture_read_unlock,
+ .readlock_held = rcu_read_lock_trace_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_tasks_tracing_torture_deferred_free,
.sync = synchronize_rcu_tasks_trace,
@@ -871,32 +885,13 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
return cur_ops->gp_diff(new, old);
}
-static bool __maybe_unused torturing_tasks(void)
-{
- return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops;
-}
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
- * CPU for moderate bursts, repeatedly registering RCU callbacks and
- * spinning waiting for them to be invoked. If a given callback takes
- * too long to be invoked, we assume that priority inversion has occurred.
+ * CPU for moderate bursts, repeatedly starting grace periods and waiting
+ * for them to complete. If a given grace period takes too long, we assume
+ * that priority inversion has occurred.
*/
-struct rcu_boost_inflight {
- struct rcu_head rcu;
- int inflight;
-};
-
-static void rcu_torture_boost_cb(struct rcu_head *head)
-{
- struct rcu_boost_inflight *rbip =
- container_of(head, struct rcu_boost_inflight, rcu);
-
- /* Ensure RCU-core accesses precede clearing ->inflight */
- smp_store_release(&rbip->inflight, 0);
-}
-
static int old_rt_runtime = -1;
static void rcu_torture_disable_rt_throttle(void)
@@ -923,49 +918,68 @@ static void rcu_torture_enable_rt_throttle(void)
old_rt_runtime = -1;
}
-static bool rcu_torture_boost_failed(unsigned long start, unsigned long end)
+static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start)
{
+ int cpu;
static int dbg_done;
-
- if (end - start > test_boost_duration * HZ - HZ / 2) {
+ unsigned long end = jiffies;
+ bool gp_done;
+ unsigned long j;
+ static unsigned long last_persist;
+ unsigned long lp;
+ unsigned long mininterval = test_boost_duration * HZ - HZ / 2;
+
+ if (end - *start > mininterval) {
+ // Recheck after checking time to avoid false positives.
+ smp_mb(); // Time check before grace-period check.
+ if (cur_ops->poll_gp_state(gp_state))
+ return false; // passed, though perhaps just barely
+ if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) {
+ // At most one persisted message per boost test.
+ j = jiffies;
+ lp = READ_ONCE(last_persist);
+ if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp)
+ pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+ return false; // passed on a technicality
+ }
VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
n_rcu_torture_boost_failure++;
- if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg)
+ if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) {
+ pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n",
+ current->rt_priority, gp_state, end - *start);
cur_ops->gp_kthread_dbg();
+ // Recheck after print to flag grace period ending during splat.
+ gp_done = cur_ops->poll_gp_state(gp_state);
+ pr_info("Boost inversion: GP %lu %s.\n", gp_state,
+ gp_done ? "ended already" : "still pending");
- return true; /* failed */
+ }
+
+ return true; // failed
+ } else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) {
+ *start = jiffies;
}
- return false; /* passed */
+ return false; // passed
}
static int rcu_torture_boost(void *arg)
{
- unsigned long call_rcu_time;
unsigned long endtime;
+ unsigned long gp_state;
+ unsigned long gp_state_time;
unsigned long oldstarttime;
- struct rcu_boost_inflight rbi = { .inflight = 0 };
VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
sched_set_fifo_low(current);
- init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
do {
bool failed = false; // Test failed already in this test interval
- bool firsttime = true;
+ bool gp_initiated = false;
- /* Increment n_rcu_torture_boosts once per boost-test */
- while (!kthread_should_stop()) {
- if (mutex_trylock(&boost_mutex)) {
- n_rcu_torture_boosts++;
- mutex_unlock(&boost_mutex);
- break;
- }
- schedule_timeout_uninterruptible(1);
- }
if (kthread_should_stop())
goto checkwait;
@@ -979,33 +993,33 @@ static int rcu_torture_boost(void *arg)
goto checkwait;
}
- /* Do one boost-test interval. */
+ // Do one boost-test interval.
endtime = oldstarttime + test_boost_duration * HZ;
while (time_before(jiffies, endtime)) {
- /* If we don't have a callback in flight, post one. */
- if (!smp_load_acquire(&rbi.inflight)) {
- /* RCU core before ->inflight = 1. */
- smp_store_release(&rbi.inflight, 1);
- cur_ops->call(&rbi.rcu, rcu_torture_boost_cb);
- /* Check if the boost test failed */
- if (!firsttime && !failed)
- failed = rcu_torture_boost_failed(call_rcu_time, jiffies);
- call_rcu_time = jiffies;
- firsttime = false;
+ // Has current GP gone too long?
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ failed = rcu_torture_boost_failed(gp_state, &gp_state_time);
+ // If we don't have a grace period in flight, start one.
+ if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) {
+ gp_state = cur_ops->start_gp_poll();
+ gp_initiated = true;
+ gp_state_time = jiffies;
}
- if (stutter_wait("rcu_torture_boost"))
+ if (stutter_wait("rcu_torture_boost")) {
sched_set_fifo_low(current);
+ // If the grace period already ended,
+ // we don't know when that happened, so
+ // start over.
+ if (cur_ops->poll_gp_state(gp_state))
+ gp_initiated = false;
+ }
if (torture_must_stop())
goto checkwait;
}
- /*
- * If boost never happened, then inflight will always be 1, in
- * this case the boost check would never happen in the above
- * loop so do another one here.
- */
- if (!firsttime && !failed && smp_load_acquire(&rbi.inflight))
- rcu_torture_boost_failed(call_rcu_time, jiffies);
+ // In case the grace period extended beyond the end of the loop.
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ rcu_torture_boost_failed(gp_state, &gp_state_time);
/*
* Set the start time of the next test interval.
@@ -1014,11 +1028,12 @@ static int rcu_torture_boost(void *arg)
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
- while (oldstarttime == boost_starttime &&
- !kthread_should_stop()) {
+ while (oldstarttime == boost_starttime && !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
- boost_starttime = jiffies +
- test_boost_interval * HZ;
+ if (oldstarttime == boost_starttime) {
+ boost_starttime = jiffies + test_boost_interval * HZ;
+ n_rcu_torture_boosts++;
+ }
mutex_unlock(&boost_mutex);
break;
}
@@ -1030,15 +1045,11 @@ checkwait: if (stutter_wait("rcu_torture_boost"))
sched_set_fifo_low(current);
} while (!torture_must_stop());
- while (smp_load_acquire(&rbi.inflight))
- schedule_timeout_uninterruptible(1); // rcu_barrier() deadlocks.
-
/* Clean up and exit. */
- while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
+ while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
schedule_timeout_uninterruptible(1);
}
- destroy_rcu_head_on_stack(&rbi.rcu);
torture_kthread_stopping("rcu_torture_boost");
return 0;
}
@@ -1553,11 +1564,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_bh_held() ||
- rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp) ||
- rcu_read_lock_trace_held() ||
- torturing_tasks());
+ !cur_ops->readlock_held || cur_ops->readlock_held());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
@@ -1861,48 +1868,49 @@ rcu_torture_stats(void *arg)
torture_shutdown_absorb("rcu_torture_stats");
} while (!torture_must_stop());
torture_kthread_stopping("rcu_torture_stats");
-
- {
- struct rcu_head *rhp;
- struct kmem_cache *kcp;
- static int z;
-
- kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
- rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
- pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
- pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
- mem_dump_obj(ZERO_SIZE_PTR);
- pr_alert("mem_dump_obj(NULL):");
- mem_dump_obj(NULL);
- pr_alert("mem_dump_obj(%px):", &rhp);
- mem_dump_obj(&rhp);
- pr_alert("mem_dump_obj(%px):", rhp);
- mem_dump_obj(rhp);
- pr_alert("mem_dump_obj(%px):", &rhp->func);
- mem_dump_obj(&rhp->func);
- pr_alert("mem_dump_obj(%px):", &z);
- mem_dump_obj(&z);
- kmem_cache_free(kcp, rhp);
- kmem_cache_destroy(kcp);
- rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
- pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
- pr_alert("mem_dump_obj(kmalloc %px):", rhp);
- mem_dump_obj(rhp);
- pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
- mem_dump_obj(&rhp->func);
- kfree(rhp);
- rhp = vmalloc(4096);
- pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
- pr_alert("mem_dump_obj(vmalloc %px):", rhp);
- mem_dump_obj(rhp);
- pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
- mem_dump_obj(&rhp->func);
- vfree(rhp);
- }
-
return 0;
}
+/* Test mem_dump_obj() and friends. */
+static void rcu_torture_mem_dump_obj(void)
+{
+ struct rcu_head *rhp;
+ struct kmem_cache *kcp;
+ static int z;
+
+ kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+ rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+ pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
+ pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
+ mem_dump_obj(ZERO_SIZE_PTR);
+ pr_alert("mem_dump_obj(NULL):");
+ mem_dump_obj(NULL);
+ pr_alert("mem_dump_obj(%px):", &rhp);
+ mem_dump_obj(&rhp);
+ pr_alert("mem_dump_obj(%px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(%px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ pr_alert("mem_dump_obj(%px):", &z);
+ mem_dump_obj(&z);
+ kmem_cache_free(kcp, rhp);
+ kmem_cache_destroy(kcp);
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ kfree(rhp);
+ rhp = vmalloc(4096);
+ pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ vfree(rhp);
+}
+
static void
rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
{
@@ -2014,8 +2022,13 @@ static int rcu_torture_stall(void *args)
__func__, raw_smp_processor_id());
while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(),
stop_at))
- if (stall_cpu_block)
+ if (stall_cpu_block) {
+#ifdef CONFIG_PREEMPTION
+ preempt_schedule();
+#else
schedule_timeout_uninterruptible(HZ);
+#endif
+ }
if (stall_cpu_irqsoff)
local_irq_enable();
else if (!stall_cpu_block)
@@ -2634,7 +2647,7 @@ static bool rcu_torture_can_boost(void)
if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
return false;
- if (!cur_ops->call)
+ if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)
return false;
prio = rcu_get_gp_kthreads_prio();
@@ -2642,7 +2655,7 @@ static bool rcu_torture_can_boost(void)
return false;
if (prio < 2) {
- if (boost_warn_once == 1)
+ if (boost_warn_once == 1)
return false;
pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME);
@@ -2818,6 +2831,8 @@ rcu_torture_cleanup(void)
if (cur_ops->cleanup != NULL)
cur_ops->cleanup();
+ rcu_torture_mem_dump_obj();
+
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
if (err_segs_recorded) {
@@ -3120,6 +3135,21 @@ rcu_torture_init(void)
if (firsterr < 0)
goto unwind;
rcutor_hp = firsterr;
+
+ // Testing RCU priority boosting requires rcutorture do
+ // some serious abuse. Counter this by running ksoftirqd
+ // at higher priority.
+ if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+ for_each_online_cpu(cpu) {
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(ksoftirqd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ }
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 02dd9767b5591..9eb8672a3213a 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -362,6 +362,146 @@ static struct ref_scale_ops rwsem_ops = {
.name = "rwsem"
};
+// Definitions for global spinlock
+static DEFINE_SPINLOCK(test_lock);
+
+static void ref_lock_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ spin_lock(&test_lock);
+ spin_unlock(&test_lock);
+ }
+ preempt_enable();
+}
+
+static void ref_lock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ spin_lock(&test_lock);
+ un_delay(udl, ndl);
+ spin_unlock(&test_lock);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops lock_ops = {
+ .readsection = ref_lock_section,
+ .delaysection = ref_lock_delay_section,
+ .name = "lock"
+};
+
+// Definitions for global irq-save spinlock
+
+static void ref_lock_irq_section(const int nloops)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ spin_lock_irqsave(&test_lock, flags);
+ spin_unlock_irqrestore(&test_lock, flags);
+ }
+ preempt_enable();
+}
+
+static void ref_lock_irq_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ spin_lock_irqsave(&test_lock, flags);
+ un_delay(udl, ndl);
+ spin_unlock_irqrestore(&test_lock, flags);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops lock_irq_ops = {
+ .readsection = ref_lock_irq_section,
+ .delaysection = ref_lock_irq_delay_section,
+ .name = "lock-irq"
+};
+
+// Definitions acquire-release.
+static DEFINE_PER_CPU(unsigned long, test_acqrel);
+
+static void ref_acqrel_section(const int nloops)
+{
+ unsigned long x;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+ smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+ }
+ preempt_enable();
+}
+
+static void ref_acqrel_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long x;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+ un_delay(udl, ndl);
+ smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops acqrel_ops = {
+ .readsection = ref_acqrel_section,
+ .delaysection = ref_acqrel_delay_section,
+ .name = "acqrel"
+};
+
+static volatile u64 stopopts;
+
+static void ref_clock_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += ktime_get_real_fast_ns();
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_clock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += ktime_get_real_fast_ns();
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops clock_ops = {
+ .readsection = ref_clock_section,
+ .delaysection = ref_clock_delay_section,
+ .name = "clock"
+};
+
static void rcu_scale_one_reader(void)
{
if (readdelay <= 0)
@@ -653,8 +793,8 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
- &refcnt_ops, &rwlock_ops, &rwsem_ops,
+ &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops,
+ &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
};
if (!torture_init_begin(scale_type, verbose))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 26344dc6483b0..a0ba2ed49bc61 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
*/
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
- int newval = ssp->srcu_lock_nesting[idx] - 1;
+ int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
if (!newval && READ_ONCE(ssp->srcu_gp_waiting))
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index e26547b34ad33..6833d88871816 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -80,7 +80,7 @@ do { \
* srcu_read_unlock() running against them. So if the is_static parameter
* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
*/
-static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
+static void init_srcu_struct_nodes(struct srcu_struct *ssp)
{
int cpu;
int i;
@@ -90,6 +90,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
struct srcu_node *snp;
struct srcu_node *snp_first;
+ /* Initialize geometry if it has not already been initialized. */
+ rcu_init_geometry();
+
/* Work out the overall tree geometry. */
ssp->level[0] = &ssp->node[0];
for (i = 1; i < rcu_num_lvls; i++)
@@ -148,14 +151,6 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
sdp->ssp = ssp;
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
- if (is_static)
- continue;
-
- /* Dynamically allocated, better be no srcu_read_locks()! */
- for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
- sdp->srcu_lock_count[i] = 0;
- sdp->srcu_unlock_count[i] = 0;
- }
}
}
@@ -179,7 +174,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
ssp->sda = alloc_percpu(struct srcu_data);
if (!ssp->sda)
return -ENOMEM;
- init_srcu_struct_nodes(ssp, is_static);
+ init_srcu_struct_nodes(ssp);
ssp->srcu_gp_seq_needed_exp = 0;
ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
@@ -777,9 +772,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
spin_unlock_irqrestore_rcu_node(sdp, flags);
/*
- * No local callbacks, so probabalistically probe global state.
+ * No local callbacks, so probabilistically probe global state.
* Exact information would require acquiring locks, which would
- * kill scalability, hence the probabalistic nature of the probe.
+ * kill scalability, hence the probabilistic nature of the probe.
*/
/* First, see if enough time has passed since the last GP. */
@@ -1000,6 +995,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
* synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
* passed the same srcu_struct structure.
*
+ * Implementation of these memory-ordering guarantees is similar to
+ * that of synchronize_rcu().
+ *
* If SRCU is likely idle, expedite the first request. This semantic
* was provided by Classic SRCU, and is relied upon by its users, so TREE
* SRCU must also provide it. Note that detecting idleness is heuristic
@@ -1392,11 +1390,15 @@ void __init srcu_init(void)
{
struct srcu_struct *ssp;
+ /*
+ * Once that is set, call_srcu() can follow the normal path and
+ * queue delayed work. This must follow RCU workqueues creation
+ * and timers initialization.
+ */
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
work.work.entry);
- check_init_srcu_struct(ssp);
list_del_init(&ssp->work.work.entry);
queue_work(rcu_gp_wq, &ssp->work.work);
}
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index d4558ab7a07d7..33d896d859023 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -94,9 +94,9 @@ static void rcu_sync_func(struct rcu_head *rhp)
rcu_sync_call(rsp);
} else {
/*
- * We're at least a GP after the last rcu_sync_exit(); eveybody
+ * We're at least a GP after the last rcu_sync_exit(); everybody
* will now have observed the write side critical section.
- * Let 'em rip!.
+ * Let 'em rip!
*/
WRITE_ONCE(rsp->gp_state, GP_IDLE);
}
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 350ebf5051f97..1cece5e9be9a9 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -23,7 +23,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
* @cbs_head: Head of callback list.
* @cbs_tail: Tail pointer for callback list.
- * @cbs_wq: Wait queue allowning new callback to get kthread's attention.
+ * @cbs_wq: Wait queue allowing new callback to get kthread's attention.
* @cbs_lock: Lock protecting callback list.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
@@ -377,6 +377,46 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// Finally, this implementation does not support high call_rcu_tasks()
// rates from multiple CPUs. If this is required, per-CPU callback lists
// will be needed.
+//
+// The implementation uses rcu_tasks_wait_gp(), which relies on function
+// pointers in the rcu_tasks structure. The rcu_spawn_tasks_kthread()
+// function sets these function pointers up so that rcu_tasks_wait_gp()
+// invokes these functions in this order:
+//
+// rcu_tasks_pregp_step():
+// Invokes synchronize_rcu() in order to wait for all in-flight
+// t->on_rq and t->nvcsw transitions to complete. This works because
+// all such transitions are carried out with interrupts disabled.
+// rcu_tasks_pertask(), invoked on every non-idle task:
+// For every runnable non-idle task other than the current one, use
+// get_task_struct() to pin down that task, snapshot that task's
+// number of voluntary context switches, and add that task to the
+// holdout list.
+// rcu_tasks_postscan():
+// Invoke synchronize_srcu() to ensure that all tasks that were
+// in the process of exiting (and which thus might not know to
+// synchronize with this RCU Tasks grace period) have completed
+// exiting.
+// check_all_holdout_tasks(), repeatedly until holdout list is empty:
+// Scans the holdout list, attempting to identify a quiescent state
+// for each task on the list. If there is a quiescent state, the
+// corresponding task is removed from the holdout list.
+// rcu_tasks_postgp():
+// Invokes synchronize_rcu() in order to ensure that all prior
+// t->on_rq and t->nvcsw transitions are seen by all CPUs and tasks
+// to have happened before the end of this RCU Tasks grace period.
+// Again, this works because all such transitions are carried out
+// with interrupts disabled.
+//
+// For each exiting task, the exit_tasks_rcu_start() and
+// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
+// read-side critical sections waited for by rcu_tasks_postscan().
+//
+// Pre-grace-period update-side code is ordered before the grace via the
+// ->cbs_lock and the smp_mb__after_spinlock(). Pre-grace-period read-side
+// code is ordered before the grace period via synchronize_rcu() call
+// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
+// disabling.
/* Pre-grace-period preparation. */
static void rcu_tasks_pregp_step(void)
@@ -504,7 +544,7 @@ DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
* or transition to usermode execution. As such, there are no read-side
* primitives analogous to rcu_read_lock() and rcu_read_unlock() because
* this primitive is intended to determine that all tasks have passed
- * through a safe state, not so much for data-strcuture synchronization.
+ * through a safe state, not so much for data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -605,8 +645,13 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
// passing an empty function to schedule_on_each_cpu(). This approach
// provides an asynchronous call_rcu_tasks_rude() API and batching
// of concurrent calls to the synchronous synchronize_rcu_rude() API.
-// This sends IPIs far and wide and induces otherwise unnecessary context
-// switches on all online CPUs, whether idle or not.
+// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
+// and induces otherwise unnecessary context switches on all online CPUs,
+// whether idle or not.
+//
+// Callback handling is provided by the rcu_tasks_kthread() function.
+//
+// Ordering is provided by the scheduler's context-switch code.
// Empty function to allow workqueues to force a context switch.
static void rcu_tasks_be_rude(struct work_struct *work)
@@ -637,7 +682,7 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
* there are no read-side primitives analogous to rcu_read_lock() and
* rcu_read_unlock() because this primitive is intended to determine
* that all tasks have passed through a safe state, not so much for
- * data-strcuture synchronization.
+ * data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -740,7 +785,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// set that task's .need_qs flag so that task's next outermost
// rcu_read_unlock_trace() will report the quiescent state (in which
// case the count of readers is incremented). If both attempts fail,
-// the task is added to a "holdout" list.
+// the task is added to a "holdout" list. Note that IPIs are used
+// to invoke trc_read_check_handler() in the context of running tasks
+// in order to avoid ordering overhead on common-case shared-variable
+// accessses.
// rcu_tasks_trace_postscan():
// Initialize state and attempt to identify an immediate quiescent
// state as above (but only for idle tasks), unblock CPU-hotplug
@@ -802,7 +850,7 @@ static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
/* If we are the last reader, wake up the grace-period kthread. */
void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
{
- int nq = t->trc_reader_special.b.need_qs;
+ int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
t->trc_reader_special.b.need_mb)
@@ -849,7 +897,7 @@ static void trc_read_check_handler(void *t_in)
// If the task is not in a read-side critical section, and
// if this is the last reader, awaken the grace-period kthread.
- if (likely(!t->trc_reader_nesting)) {
+ if (likely(!READ_ONCE(t->trc_reader_nesting))) {
if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
wake_up(&trc_wait);
// Mark as checked after decrement to avoid false
@@ -858,7 +906,7 @@ static void trc_read_check_handler(void *t_in)
goto reset_ipi;
}
// If we are racing with an rcu_read_unlock_trace(), try again later.
- if (unlikely(t->trc_reader_nesting < 0)) {
+ if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) {
if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
wake_up(&trc_wait);
goto reset_ipi;
@@ -868,7 +916,7 @@ static void trc_read_check_handler(void *t_in)
// Get here if the task is in a read-side critical section. Set
// its state so that it will awaken the grace-period kthread upon
// exit from that critical section.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
reset_ipi:
@@ -905,13 +953,13 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
n_heavy_reader_ofl_updates++;
in_qs = true;
} else {
+ // The task is not running, so C-language access is safe.
in_qs = likely(!t->trc_reader_nesting);
}
- // Mark as checked. Because this is called from the grace-period
- // kthread, also remove the task from the holdout list.
+ // Mark as checked so that the grace-period kthread will
+ // remove it from the holdout list.
t->trc_reader_checked = true;
- trc_del_holdout(t);
if (in_qs)
return true; // Already in quiescent state, done!!!
@@ -920,7 +968,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
// state so that it will awaken the grace-period kthread upon exit
// from that critical section.
atomic_inc(&trc_n_readers_need_end); // One more to wait on.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
return true;
}
@@ -938,8 +986,7 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// The current task had better be in a quiescent state.
if (t == current) {
t->trc_reader_checked = true;
- trc_del_holdout(t);
- WARN_ON_ONCE(t->trc_reader_nesting);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
return;
}
@@ -951,6 +998,12 @@ static void trc_wait_for_one_reader(struct task_struct *t,
}
put_task_struct(t);
+ // If this task is not yet on the holdout list, then we are in
+ // an RCU read-side critical section. Otherwise, the invocation of
+ // rcu_add_holdout() that added it to the list did the necessary
+ // get_task_struct(). Either way, the task cannot be freed out
+ // from under this code.
+
// If currently running, send an IPI, either way, add to list.
trc_add_holdout(t, bhp);
if (task_curr(t) &&
@@ -1049,8 +1102,8 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0],
".i"[is_idle_task(t)],
".N"[cpu > 0 && tick_nohz_full_cpu(cpu)],
- t->trc_reader_nesting,
- " N"[!!t->trc_reader_special.b.need_qs],
+ READ_ONCE(t->trc_reader_nesting),
+ " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)],
cpu);
sched_show_task(t);
}
@@ -1144,7 +1197,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
static void exit_tasks_rcu_finish_trace(struct task_struct *t)
{
WRITE_ONCE(t->trc_reader_checked, true);
- WARN_ON_ONCE(t->trc_reader_nesting);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
WRITE_ONCE(t->trc_reader_nesting, 0);
if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
rcu_read_unlock_trace_special(t, 0);
@@ -1163,7 +1216,7 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t)
* there are no read-side primitives analogous to rcu_read_lock() and
* rcu_read_unlock() because this primitive is intended to determine
* that all tasks have passed through a safe state, not so much for
- * data-strcuture synchronization.
+ * data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -1356,5 +1409,4 @@ void __init rcu_init_tasks_generic(void)
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
-void show_rcu_tasks_gp_kthreads(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c8a029fbb1143..340b3f8b090d4 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -221,5 +221,4 @@ void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
rcu_early_boot_tests();
- srcu_init();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e78b2430c168..13bd8eee62bff 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -72,17 +72,10 @@
/* Data structures. */
-/*
- * Steal a bit from the bottom of ->dynticks for idle entry/exit
- * control. Initially this is for TLB flushing.
- */
-#define RCU_DYNTICK_CTRL_MASK 0x1
-#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
-
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
- .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+ .dynticks = 1UL,
#ifdef CONFIG_RCU_NOCB_CPU
.cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
#endif
@@ -186,6 +179,17 @@ module_param(rcu_unlock_delay, int, 0444);
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);
+// A page shrinker can ask for pages to be freed to make them
+// available for other parts of the system. This usually happens
+// under low memory conditions, and in that case we should also
+// defer page-cache filling for a short time period.
+//
+// The default value is 5 seconds, which is long enough to reduce
+// interference with the shrinker while it asks other systems to
+// drain their caches.
+static int rcu_delay_page_cache_fill_msec = 5000;
+module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
@@ -202,7 +206,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
* the need for long delays to increase some race probabilities with the
* need for fast grace periods to increase other race probabilities.
*/
-#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
+#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
/*
* Compute the mask of online CPUs for the specified rcu_node structure.
@@ -242,6 +246,22 @@ void rcu_softirq_qs(void)
{
rcu_qs();
rcu_preempt_deferred_qs(current);
+ rcu_tasks_qs(current, false);
+}
+
+/*
+ * Increment the current CPU's rcu_data structure's ->dynticks field
+ * with ordering. Return the new value.
+ */
+static noinstr unsigned long rcu_dynticks_inc(int incby)
+{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ int seq;
+
+ seq = READ_ONCE(rdp->dynticks) + incby;
+ smp_store_release(&rdp->dynticks, seq);
+ smp_mb(); // Fundamental RCU ordering guarantee.
+ return seq;
}
/*
@@ -252,7 +272,6 @@ void rcu_softirq_qs(void)
*/
static noinstr void rcu_dynticks_eqs_enter(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
int seq;
/*
@@ -261,13 +280,9 @@ static noinstr void rcu_dynticks_eqs_enter(void)
* next idle sojourn.
*/
rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ seq = rcu_dynticks_inc(1);
// RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_CTR));
- /* Better not have special action (TLB flush) pending! */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_MASK));
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1));
}
/*
@@ -277,7 +292,6 @@ static noinstr void rcu_dynticks_eqs_enter(void)
*/
static noinstr void rcu_dynticks_eqs_exit(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
int seq;
/*
@@ -285,15 +299,10 @@ static noinstr void rcu_dynticks_eqs_exit(void)
* and we also must force ordering with the next RCU read-side
* critical section.
*/
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ seq = rcu_dynticks_inc(1);
// RCU is now watching. Better not be in an extended quiescent state!
rcu_dynticks_task_trace_exit(); // After ->dynticks update!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !(seq & RCU_DYNTICK_CTRL_CTR));
- if (seq & RCU_DYNTICK_CTRL_MASK) {
- arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
- smp_mb__after_atomic(); /* _exit after clearing mask. */
- }
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1));
}
/*
@@ -310,9 +319,9 @@ static void rcu_dynticks_eqs_online(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
+ if (READ_ONCE(rdp->dynticks) & 0x1)
return;
- atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ rcu_dynticks_inc(1);
}
/*
@@ -322,9 +331,7 @@ static void rcu_dynticks_eqs_online(void)
*/
static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
+ return !(READ_ONCE(this_cpu_ptr(&rcu_data)->dynticks) & 0x1);
}
/*
@@ -333,9 +340,8 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
*/
static int rcu_dynticks_snap(struct rcu_data *rdp)
{
- int snap = atomic_add_return(0, &rdp->dynticks);
-
- return snap & ~RCU_DYNTICK_CTRL_MASK;
+ smp_mb(); // Fundamental RCU ordering guarantee.
+ return smp_load_acquire(&rdp->dynticks);
}
/*
@@ -344,7 +350,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & RCU_DYNTICK_CTRL_CTR);
+ return !(snap & 0x1);
}
/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
@@ -375,8 +381,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
int snap;
// If not quiescent, force back to earlier extended quiescent state.
- snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK |
- RCU_DYNTICK_CTRL_CTR);
+ snap = READ_ONCE(rdp->dynticks) & ~0x1;
smp_rmb(); // Order ->dynticks and *vp reads.
if (READ_ONCE(*vp))
@@ -384,32 +389,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
smp_rmb(); // Order *vp read and ->dynticks re-read.
// If still in the same extended quiescent state, we are good!
- return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK);
-}
-
-/*
- * Set the special (bottom) bit of the specified CPU so that it
- * will take special action (such as flushing its TLB) on the
- * next exit from an extended quiescent state. Returns true if
- * the bit was successfully set, or false if the CPU was not in
- * an extended quiescent state.
- */
-bool rcu_eqs_special_set(int cpu)
-{
- int old;
- int new;
- int new_old;
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- new_old = atomic_read(&rdp->dynticks);
- do {
- old = new_old;
- if (old & RCU_DYNTICK_CTRL_CTR)
- return false;
- new = old | RCU_DYNTICK_CTRL_MASK;
- new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
- } while (new_old != old);
- return true;
+ return snap == READ_ONCE(rdp->dynticks);
}
/*
@@ -425,13 +405,12 @@ bool rcu_eqs_special_set(int cpu)
*/
notrace void rcu_momentary_dyntick_idle(void)
{
- int special;
+ int seq;
raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
- special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
- &this_cpu_ptr(&rcu_data)->dynticks);
+ seq = rcu_dynticks_inc(2);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
+ WARN_ON_ONCE(!(seq & 0x1));
rcu_preempt_deferred_qs(current);
}
EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
@@ -647,7 +626,7 @@ static noinstr void rcu_eqs_enter(bool user)
lockdep_assert_irqs_disabled();
instrumentation_begin();
- trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
+ trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, READ_ONCE(rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rcu_prepare_for_idle();
rcu_preempt_deferred_qs(current);
@@ -782,7 +761,7 @@ noinstr void rcu_nmi_exit(void)
*/
if (rdp->dynticks_nmi_nesting != 1) {
trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
- atomic_read(&rdp->dynticks));
+ READ_ONCE(rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
rdp->dynticks_nmi_nesting - 2);
instrumentation_end();
@@ -790,7 +769,7 @@ noinstr void rcu_nmi_exit(void)
}
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
+ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, READ_ONCE(rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
if (!in_nmi())
@@ -833,28 +812,6 @@ void noinstr rcu_irq_exit(void)
rcu_nmi_exit();
}
-/**
- * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
- * towards in kernel preemption
- *
- * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
- * from RCU point of view. Invoked from return from interrupt before kernel
- * preemption.
- */
-void rcu_irq_exit_preempt(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_exit();
-
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
- "RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
- DYNTICK_IRQ_NONIDLE,
- "Bad RCU dynticks_nmi_nesting counter\n");
- RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
- "RCU in extended quiescent state!");
-}
-
#ifdef CONFIG_PROVE_RCU
/**
* rcu_irq_exit_check_preempt - Validate that scheduling is possible
@@ -920,7 +877,7 @@ static void noinstr rcu_eqs_exit(bool user)
instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
+ trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, READ_ONCE(rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
WRITE_ONCE(rdp->dynticks_nesting, 1);
WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -959,7 +916,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
*/
void noinstr rcu_user_exit(void)
{
- rcu_eqs_exit(1);
+ rcu_eqs_exit(true);
}
/**
@@ -1083,7 +1040,7 @@ noinstr void rcu_nmi_enter(void)
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
rdp->dynticks_nmi_nesting,
- rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
+ rdp->dynticks_nmi_nesting + incby, READ_ONCE(rdp->dynticks));
instrumentation_end();
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
rdp->dynticks_nmi_nesting + incby);
@@ -1225,7 +1182,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
/*
- * We are reporting a quiescent state on behalf of some other CPU, so
+ * When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
* of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
* After all, the CPU might be in deep idle state, and thus executing no
@@ -2048,7 +2005,7 @@ static void rcu_gp_fqs_loop(void)
/*
* Clean up after the old grace period.
*/
-static void rcu_gp_cleanup(void)
+static noinline void rcu_gp_cleanup(void)
{
int cpu;
bool needgp = false;
@@ -2479,9 +2436,6 @@ int rcutree_dead_cpu(unsigned int cpu)
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
- /* Do any needed no-CB deferred wakeups from this CPU. */
- do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
-
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
return 0;
@@ -2489,7 +2443,7 @@ int rcutree_dead_cpu(unsigned int cpu)
/*
* Invoke any RCU callbacks that have made it to the end of their grace
- * period. Thottle as specified by rdp->blimit.
+ * period. Throttle as specified by rdp->blimit.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
@@ -2629,7 +2583,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
* state, for example, user mode or idle loop. It also schedules RCU
* core processing. If the current grace period has gone on too long,
* it will ask the scheduler to manufacture a context switch for the sole
- * purpose of providing a providing the needed quiescent state.
+ * purpose of providing the needed quiescent state.
*/
void rcu_sched_clock_irq(int user)
{
@@ -2911,7 +2865,6 @@ static int __init rcu_spawn_core_kthreads(void)
"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
return 0;
}
-early_initcall(rcu_spawn_core_kthreads);
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
@@ -3082,12 +3035,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* period elapses, in other words after all pre-existing RCU read-side
* critical sections have completed. However, the callback function
* might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
- * may be nested. In addition, regions of code across which interrupts,
- * preemption, or softirqs have been disabled also serve as RCU read-side
- * critical sections. This includes hardware interrupt handlers, softirq
- * handlers, and NMI handlers.
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
*
* Note that all CPUs must agree that the grace period extended beyond
* all pre-existing RCU read-side critical section. On systems with more
@@ -3107,6 +3062,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* between the call to call_rcu() and the invocation of "func()" -- even
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
@@ -3171,6 +3129,7 @@ struct kfree_rcu_cpu_work {
* Even though it is lockless an access has to be protected by the
* per-cpu lock.
* @page_cache_work: A work to refill the cache when it is empty
+ * @backoff_page_cache_fill: Delay cache refills
* @work_in_progress: Indicates that page_cache_work is running
* @hrtimer: A hrtimer for scheduling a page_cache_work
* @nr_bkv_objs: number of allocated objects at @bkvcache.
@@ -3190,7 +3149,8 @@ struct kfree_rcu_cpu {
bool initialized;
int count;
- struct work_struct page_cache_work;
+ struct delayed_work page_cache_work;
+ atomic_t backoff_page_cache_fill;
atomic_t work_in_progress;
struct hrtimer hrtimer;
@@ -3237,7 +3197,7 @@ get_cached_bnode(struct kfree_rcu_cpu *krcp)
if (!krcp->nr_bkv_objs)
return NULL;
- krcp->nr_bkv_objs--;
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
return (struct kvfree_rcu_bulk_data *)
llist_del_first(&krcp->bkvcache);
}
@@ -3251,14 +3211,33 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp,
return false;
llist_add((struct llist_node *) bnode, &krcp->bkvcache);
- krcp->nr_bkv_objs++;
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
return true;
+}
+
+static int
+drain_page_cache(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+ struct llist_node *page_list, *pos, *n;
+ int freed = 0;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ page_list = llist_del_all(&krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, 0);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ llist_for_each_safe(pos, n, page_list) {
+ free_page((unsigned long)pos);
+ freed++;
+ }
+
+ return freed;
}
/*
* This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bhead_free or ->head_free.
+ * It frees all the objects queued on ->bkvhead_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
@@ -3285,7 +3264,7 @@ static void kfree_rcu_work(struct work_struct *work)
krwp->head_free = NULL;
raw_spin_unlock_irqrestore(&krcp->lock, flags);
- // Handle two first channels.
+ // Handle the first two channels.
for (i = 0; i < FREE_N_CHANNELS; i++) {
for (; bkvhead[i]; bkvhead[i] = bnext) {
bnext = bkvhead[i]->next;
@@ -3323,9 +3302,11 @@ static void kfree_rcu_work(struct work_struct *work)
}
/*
- * Emergency case only. It can happen under low memory
- * condition when an allocation gets failed, so the "bulk"
- * path can not be temporary maintained.
+ * This is used when the "bulk" path can not be used for the
+ * double-argument of kvfree_rcu(). This happens when the
+ * page-cache is empty, which means that objects are instead
+ * queued on a linked list through their rcu_head structures.
+ * This list is named "Channel 3".
*/
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
@@ -3345,34 +3326,31 @@ static void kfree_rcu_work(struct work_struct *work)
}
/*
- * Schedule the kfree batch RCU work to run in workqueue context after a GP.
- *
- * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
- * timeout has been reached.
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
-static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+static void kfree_rcu_monitor(struct work_struct *work)
{
- struct kfree_rcu_cpu_work *krwp;
- bool repeat = false;
+ struct kfree_rcu_cpu *krcp = container_of(work,
+ struct kfree_rcu_cpu, monitor_work.work);
+ unsigned long flags;
int i, j;
- lockdep_assert_held(&krcp->lock);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Attempt to start a new batch.
for (i = 0; i < KFREE_N_BATCHES; i++) {
- krwp = &(krcp->krw_arr[i]);
+ struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
- /*
- * Try to detach bkvhead or head and attach it over any
- * available corresponding free channel. It can be that
- * a previous RCU batch is in progress, it means that
- * immediately to queue another one is not possible so
- * return false to tell caller to retry.
- */
+ // Try to detach bkvhead or head and attach it over any
+ // available corresponding free channel. It can be that
+ // a previous RCU batch is in progress, it means that
+ // immediately to queue another one is not possible so
+ // in that case the monitor work is rearmed.
if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
(krcp->head && !krwp->head_free)) {
- // Channel 1 corresponds to SLAB ptrs.
- // Channel 2 corresponds to vmalloc ptrs.
+ // Channel 1 corresponds to the SLAB-pointer bulk path.
+ // Channel 2 corresponds to vmalloc-pointer bulk path.
for (j = 0; j < FREE_N_CHANNELS; j++) {
if (!krwp->bkvhead_free[j]) {
krwp->bkvhead_free[j] = krcp->bkvhead[j];
@@ -3380,7 +3358,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
}
}
- // Channel 3 corresponds to emergency path.
+ // Channel 3 corresponds to both SLAB and vmalloc
+ // objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
krcp->head = NULL;
@@ -3388,65 +3367,35 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
WRITE_ONCE(krcp->count, 0);
- /*
- * One work is per one batch, so there are three
- * "free channels", the batch can handle. It can
- * be that the work is in the pending state when
- * channels have been detached following by each
- * other.
- */
+ // One work is per one batch, so there are three
+ // "free channels", the batch can handle. It can
+ // be that the work is in the pending state when
+ // channels have been detached following by each
+ // other.
queue_rcu_work(system_wq, &krwp->rcu_work);
}
-
- // Repeat if any "free" corresponding channel is still busy.
- if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
- repeat = true;
}
- return !repeat;
-}
-
-static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
- unsigned long flags)
-{
- // Attempt to start a new batch.
- krcp->monitor_todo = false;
- if (queue_kfree_rcu_work(krcp)) {
- // Success! Our job is done here.
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
- return;
- }
+ // If there is nothing to detach, it means that our job is
+ // successfully done here. In case of having at least one
+ // of the channels that is still busy we should rearm the
+ // work to repeat an attempt. Because previous batches are
+ // still in progress.
+ if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
+ krcp->monitor_todo = false;
+ else
+ schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- // Previous RCU batch still in progress, try again later.
- krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
-/*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
- * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
- */
-static void kfree_rcu_monitor(struct work_struct *work)
-{
- unsigned long flags;
- struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
- monitor_work.work);
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (krcp->monitor_todo)
- kfree_rcu_drain_unlock(krcp, flags);
- else
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-}
-
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
struct kfree_rcu_cpu *krcp =
container_of(t, struct kfree_rcu_cpu, hrtimer);
- queue_work(system_highpri_wq, &krcp->page_cache_work);
+ queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
return HRTIMER_NORESTART;
}
@@ -3455,12 +3404,16 @@ static void fill_page_cache_func(struct work_struct *work)
struct kvfree_rcu_bulk_data *bnode;
struct kfree_rcu_cpu *krcp =
container_of(work, struct kfree_rcu_cpu,
- page_cache_work);
+ page_cache_work.work);
unsigned long flags;
+ int nr_pages;
bool pushed;
int i;
- for (i = 0; i < rcu_min_cached_objs; i++) {
+ nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+ 1 : rcu_min_cached_objs;
+
+ for (i = 0; i < nr_pages; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -3477,6 +3430,7 @@ static void fill_page_cache_func(struct work_struct *work)
}
atomic_set(&krcp->work_in_progress, 0);
+ atomic_set(&krcp->backoff_page_cache_fill, 0);
}
static void
@@ -3484,10 +3438,15 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- krcp->hrtimer.function = schedule_page_work_fn;
- hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ if (atomic_read(&krcp->backoff_page_cache_fill)) {
+ queue_delayed_work(system_wq,
+ &krcp->page_cache_work,
+ msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+ } else {
+ hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
}
}
@@ -3552,11 +3511,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
}
/*
- * Queue a request for lazy invocation of appropriate free routine after a
- * grace period. Please note there are three paths are maintained, two are the
- * main ones that use array of pointers interface and third one is emergency
- * one, that is used only when the main path can not be maintained temporary,
- * due to memory pressure.
+ * Queue a request for lazy invocation of the appropriate free routine
+ * after a grace period. Please note that three paths are maintained,
+ * two for the common case using arrays of pointers and a third one that
+ * is used only when the main paths cannot be used, for example, due to
+ * memory pressure.
*
* Each kvfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
@@ -3645,6 +3604,8 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count += READ_ONCE(krcp->count);
+ count += READ_ONCE(krcp->nr_bkv_objs);
+ atomic_set(&krcp->backoff_page_cache_fill, 1);
}
return count;
@@ -3654,18 +3615,14 @@ static unsigned long
kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
int cpu, freed = 0;
- unsigned long flags;
for_each_possible_cpu(cpu) {
int count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count = krcp->count;
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (krcp->monitor_todo)
- kfree_rcu_drain_unlock(krcp, flags);
- else
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ count += drain_page_cache(krcp);
+ kfree_rcu_monitor(&krcp->monitor_work.work);
sc->nr_to_scan -= count;
freed += count;
@@ -3693,7 +3650,8 @@ void __init kfree_rcu_scheduler_running(void)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
raw_spin_lock_irqsave(&krcp->lock, flags);
- if (!krcp->head || krcp->monitor_todo) {
+ if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
+ krcp->monitor_todo) {
raw_spin_unlock_irqrestore(&krcp->lock, flags);
continue;
}
@@ -3750,10 +3708,12 @@ static int rcu_blocking_is_gp(void)
* read-side critical sections have completed. Note, however, that
* upon return from synchronize_rcu(), the caller might well be executing
* concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting. RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- * In addition, regions of code across which interrupts, preemption, or
- * softirqs have been disabled also serve as RCU read-side critical
+ * synchronize_rcu() was waiting.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
* sections. This includes hardware interrupt handlers, softirq handlers,
* and NMI handlers.
*
@@ -3774,6 +3734,9 @@ static int rcu_blocking_is_gp(void)
* to have executed a full memory barrier during the execution of
* synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void synchronize_rcu(void)
{
@@ -3844,11 +3807,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
/**
* poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
*
- * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call from
* which oldstate was obtained, return @true, otherwise return @false.
- * If @false is returned, it is the caller's responsibilty to invoke this
+ * If @false is returned, it is the caller's responsibility to invoke this
* function later on until it does return @true. Alternatively, the caller
* can explicitly wait for a grace period, for example, by passing @oldstate
* to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
@@ -3860,6 +3823,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
* (many hours even on 32-bit systems) should check them occasionally
* and either refresh them or set a flag indicating that the grace period
* has completed.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
@@ -3874,7 +3842,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
/**
* cond_synchronize_rcu - Conditionally wait for an RCU grace period
*
- * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call to
* get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
@@ -3884,6 +3852,11 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
* counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
* so waiting for one additional grace period should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
*/
void cond_synchronize_rcu(unsigned long oldstate)
{
@@ -3911,7 +3884,7 @@ static int rcu_pending(int user)
check_cpu_stall(rdp);
/* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp))
+ if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
return 1;
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
@@ -4094,7 +4067,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
/*
* Propagate ->qsinitmask bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
- * must hold the corresponding leaf rcu_node ->lock with interrrupts
+ * must hold the corresponding leaf rcu_node ->lock with interrupts
* disabled.
*/
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
@@ -4189,7 +4162,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- rcu_prepare_kthreads(cpu);
+ rcu_spawn_one_boost_kthread(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
@@ -4472,6 +4445,7 @@ static int __init rcu_spawn_gp_kthread(void)
wake_up_process(t);
rcu_spawn_nocb_kthreads();
rcu_spawn_boost_kthreads();
+ rcu_spawn_core_kthreads();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -4582,11 +4556,25 @@ static void __init rcu_init_one(void)
* replace the definitions in tree.h because those are needed to size
* the ->node array in the rcu_state structure.
*/
-static void __init rcu_init_geometry(void)
+void rcu_init_geometry(void)
{
ulong d;
int i;
+ static unsigned long old_nr_cpu_ids;
int rcu_capacity[RCU_NUM_LVLS];
+ static bool initialized;
+
+ if (initialized) {
+ /*
+ * Warn if setup_nr_cpu_ids() had not yet been invoked,
+ * unless nr_cpus_ids == NR_CPUS, in which case who cares?
+ */
+ WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
+ return;
+ }
+
+ old_nr_cpu_ids = nr_cpu_ids;
+ initialized = true;
/*
* Initialize any unspecified boot parameters.
@@ -4687,6 +4675,18 @@ static void __init kfree_rcu_batch_init(void)
int cpu;
int i;
+ /* Clamp it to [0:100] seconds interval. */
+ if (rcu_delay_page_cache_fill_msec < 0 ||
+ rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+
+ rcu_delay_page_cache_fill_msec =
+ clamp(rcu_delay_page_cache_fill_msec, 0,
+ (int) (100 * MSEC_PER_SEC));
+
+ pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+ rcu_delay_page_cache_fill_msec);
+ }
+
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
@@ -4696,7 +4696,7 @@ static void __init kfree_rcu_batch_init(void)
}
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
- INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
+ INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
if (register_shrinker(&kfree_rcu_shrinker))
@@ -4730,12 +4730,11 @@ void __init rcu_init(void)
rcutree_online_cpu(cpu);
}
- /* Create workqueue for expedited GPs and for Tree SRCU. */
+ /* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_par_gp_wq);
- srcu_init();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
@@ -4747,4 +4746,5 @@ void __init rcu_init(void)
#include "tree_stall.h"
#include "tree_exp.h"
+#include "tree_nocb.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 71821d59d95c5..ce611da2ff6b3 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -115,6 +115,7 @@ struct rcu_node {
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
+ unsigned long n_boosts; /* Number of boosts for this rcu_node structure. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -153,7 +154,7 @@ struct rcu_data {
unsigned long gp_seq; /* Track rsp->gp_seq counter. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
- bool core_needs_qs; /* Core waits for quiesc state. */
+ bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
@@ -183,7 +184,7 @@ struct rcu_data {
int dynticks_snap; /* Per-GP tracking for dynticks. */
long dynticks_nesting; /* Track process nesting level. */
long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */
- atomic_t dynticks; /* Even value for idle, else odd. */
+ unsigned long dynticks; /* Even value for idle, else odd. */
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
@@ -218,7 +219,6 @@ struct rcu_data {
/* The following fields are used by GP kthread, hence own cacheline. */
raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
- struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
u8 nocb_gp_bypass; /* Found a bypass on last scan? */
u8 nocb_gp_gp; /* GP to wait for on last scan? */
@@ -257,10 +257,10 @@ struct rcu_data {
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
-#define RCU_NOCB_WAKE_OFF -1
#define RCU_NOCB_WAKE_NOT 0
-#define RCU_NOCB_WAKE 1
-#define RCU_NOCB_WAKE_FORCE 2
+#define RCU_NOCB_WAKE_BYPASS 1
+#define RCU_NOCB_WAKE 2
+#define RCU_NOCB_WAKE_FORCE 3
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -417,8 +417,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static bool rcu_is_callbacks_kthread(void);
static void rcu_cpu_kthread_setup(unsigned int cpu);
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(void);
static void rcu_prepare_for_idle(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
@@ -434,7 +434,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
bool *was_alldone, unsigned long flags);
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
new file mode 100644
index 0000000000000..bf2690ca5d2b6
--- /dev/null
+++ b/kernel/rcu/tree_nocb.h
@@ -0,0 +1,1496 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ * Copyright SUSE, 2021
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
+ * Frederic Weisbecker <frederic@kernel.org>
+ */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
+static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return lockdep_is_held(&rdp->nocb_lock);
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ /* Race on early boot between thread creation and assignment */
+ if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
+ return true;
+
+ if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
+ if (in_task())
+ return true;
+ return false;
+}
+
+/*
+ * Offload callback processing from the boot-time-specified set of CPUs
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
+ *
+ * This is intended to be used in conjunction with Frederic Weisbecker's
+ * adaptive-idle work, which would seriously reduce OS jitter on CPUs
+ * running CPU-bound user-mode computations.
+ *
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
+ */
+
+
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * If the list is invalid, a warning is emitted and all CPUs are offloaded.
+ */
+static int __init rcu_nocb_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ if (cpulist_parse(str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
+ return 1;
+}
+__setup("rcu_nocbs=", rcu_nocb_setup);
+
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = true;
+ return 0;
+}
+early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+
+/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
+ */
+static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, increment ->nocb_lock_contended to
+ * flag the contention.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+ __acquires(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ atomic_inc(&rdp->nocb_lock_contended);
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ smp_mb__after_atomic(); /* atomic_inc() before lock. */
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+ smp_mb__before_atomic(); /* atomic_dec() after lock. */
+ atomic_dec(&rdp->nocb_lock_contended);
+}
+
+/*
+ * Spinwait until the specified rcu_data structure's ->nocb_lock is
+ * not contended. Please note that this is extremely special-purpose,
+ * relying on the fact that at most two kthreads and one CPU contend for
+ * this lock, and also that the two kthreads are guaranteed to have frequent
+ * grace-period-duration time intervals between successive acquisitions
+ * of the lock. This allows us to use an extremely simple throttling
+ * mechanism, and further to apply it only to the CPU doing floods of
+ * call_rcu() invocations. Don't try this at home!
+ */
+static void rcu_nocb_wait_contended(struct rcu_data *rdp)
+{
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
+ cpu_relax();
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+ __releases(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_rdp_is_offloaded(rdp))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_rdp_is_offloaded(rdp))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+ swake_up_all(sq);
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
+}
+
+/* Is the specified CPU a no-CBs CPU? */
+bool rcu_is_nocb_cpu(int cpu)
+{
+ if (cpumask_available(rcu_nocb_mask))
+ return cpumask_test_cpu(cpu, rcu_nocb_mask);
+ return false;
+}
+
+static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp,
+ bool force, unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ bool needwake = false;
+
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ return false;
+ }
+
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&rdp_gp->nocb_timer);
+ }
+
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+ }
+
+ return needwake;
+}
+
+/*
+ * Kick the GP kthread for this NOCB group.
+ */
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+}
+
+/*
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
+ */
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+
+ /*
+ * Bypass wakeup overrides previous deferments. In case
+ * of callback storm, no need to wake up too early.
+ */
+ if (waketype == RCU_NOCB_WAKE_BYPASS) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else {
+ if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
+ if (rdp_gp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ }
+
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ struct rcu_cblist rcl;
+
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
+ rcu_lockdep_assert_cblist_protected(rdp);
+ lockdep_assert_held(&rdp->nocb_bypass_lock);
+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+ return false;
+ }
+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
+ if (rhp)
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ rcu_nocb_bypass_unlock(rdp);
+ return true;
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ if (!rcu_rdp_is_offloaded(rdp))
+ return true;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j);
+}
+
+/*
+ * If the ->nocb_bypass_lock is immediately available, flush the
+ * ->nocb_bypass queue into ->cblist.
+ */
+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_rdp_is_offloaded(rdp) ||
+ !rcu_nocb_bypass_trylock(rdp))
+ return;
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
+}
+
+/*
+ * See whether it is appropriate to use the ->nocb_bypass list in order
+ * to control contention on ->nocb_lock. A limited number of direct
+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
+ * used if ->cblist is empty, because otherwise callbacks can be stranded
+ * on ->nocb_bypass because we cannot count on the current CPU ever again
+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
+ * non-empty, the corresponding no-CBs grace-period kthread must not be
+ * in an indefinite sleep state.
+ *
+ * Finally, it is not permitted to use the bypass during early boot,
+ * as doing so would confuse the auto-initialization code. Besides
+ * which, there is no point in worrying about lock contention while
+ * there is only one CPU in operation.
+ */
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
+{
+ unsigned long c;
+ unsigned long cur_gp_seq;
+ unsigned long j = jiffies;
+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+
+ lockdep_assert_irqs_disabled();
+
+ // Pure softirq/rcuc based processing: no bypassing, no
+ // locking.
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // In the process of (de-)offloading: no bypassing, but
+ // locking.
+ if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false; /* Not offloaded, no bypassing. */
+ }
+
+ // Don't use ->nocb_bypass during early boot.
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // If we have advanced to a new jiffy, reset counts to allow
+ // moving back from ->nocb_bypass to ->cblist.
+ if (j == rdp->nocb_nobypass_last) {
+ c = rdp->nocb_nobypass_count + 1;
+ } else {
+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
+ nocb_nobypass_lim_per_jiffy))
+ c = 0;
+ else if (c > nocb_nobypass_lim_per_jiffy)
+ c = nocb_nobypass_lim_per_jiffy;
+ }
+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
+
+ // If there hasn't yet been all that many ->cblist enqueues
+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
+ // ->nocb_bypass first.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+
+ // If ->nocb_bypass has been used too long or is too full,
+ // flush ->nocb_bypass to ->cblist.
+ if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ ncbs >= qhimark) {
+ rcu_nocb_lock(rdp);
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ return true; // Callback already enqueued.
+ }
+
+ // We need to use the bypass.
+ rcu_nocb_wait_contended(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+ smp_mb(); /* Order enqueue before wake. */
+ if (ncbs) {
+ local_irq_restore(flags);
+ } else {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+ }
+ return true; // Callback already enqueued.
+}
+
+/*
+ * Awaken the no-CBs grace-period kthread if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
+ *
+ * If warranted, also wake up the kthread servicing this CPUs queues.
+ */
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long len;
+ struct task_struct *t;
+
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
+ if (rcu_nocb_poll || !t) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeNotPoll"));
+ return;
+ }
+ // Need to actually to a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
+ if (!irqs_disabled_flags(flags)) {
+ /* ... if queue was empty ... */
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ }
+ } else if (len > rdp->qlen_last_fqs_check + qhimark) {
+ /* ... or if many callbacks queued. */
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ smp_mb(); /* Enqueue before timer_pending(). */
+ if ((rdp->nocb_cb_sleep ||
+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
+ !timer_pending(&rdp->nocb_timer)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
+ TPS("WakeOvfIsDeferred"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+ return;
+}
+
+/*
+ * Check if we ignore this rdp.
+ *
+ * We check that without holding the nocb lock but
+ * we make sure not to miss a freshly offloaded rdp
+ * with the current ordering:
+ *
+ * rdp_offload_toggle() nocb_gp_enabled_cb()
+ * ------------------------- ----------------------------
+ * WRITE flags LOCK nocb_gp_lock
+ * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
+ * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
+ * UNLOCK nocb_gp_lock READ flags
+ */
+static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
+
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
+ bool *needwake_state)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ }
+ return false;
+ }
+
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it ever gets re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ return true;
+}
+
+
+/*
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
+ * or for grace periods to end.
+ */
+static void nocb_gp_wait(struct rcu_data *my_rdp)
+{
+ bool bypass = false;
+ long bypass_ncbs;
+ int __maybe_unused cpu = my_rdp->cpu;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool gotcbs = false;
+ unsigned long j = jiffies;
+ bool needwait_gp = false; // This prevents actual uninitialized use.
+ bool needwake;
+ bool needwake_gp;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
+ bool wasempty = false;
+
+ /*
+ * Each pass through the following loop checks for CBs and for the
+ * nearest grace period (if any) to wait for next. The CB kthreads
+ * and the global grace-period kthread are awakened if needed.
+ */
+ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ bool needwake_state = false;
+
+ if (!nocb_gp_enabled_cb(rdp))
+ continue;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ continue;
+ }
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ if (bypass_ncbs &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ continue; /* No callbacks here, try next. */
+ }
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("Bypass"));
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ wasempty = rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(wasempty &&
+ !rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ smp_mb(); /* CB invocation -after- GP end. */
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ }
+
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+
+ if (bypass && !rcu_nocb_poll) {
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+ TPS("WakeBypassIsDeferred"));
+ }
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series. */
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ schedule_timeout_idle(1);
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
+ swait_event_interruptible_exclusive(
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
+ }
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ }
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
+}
+
+/*
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but only once at least one CPU in that group has come online
+ * at least once since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
+ */
+static int rcu_nocb_gp_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
+/*
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
+ */
+static void nocb_cb_wait(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_state = false;
+ bool needwake_gp = false;
+ bool can_sleep = true;
+ struct rcu_node *rnp = rdp->mynode;
+
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ /*
+ * Disable BH to provide the expected environment. Also, when
+ * transitioning to/from NOCB mode, a self-requeuing callback might
+ * be invoked from softirq. A short grace period could cause both
+ * instances of this callback would execute concurrently.
+ */
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+ if (rcu_segcblist_ready_cbs(cblist))
+ can_sleep = false;
+ } else {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We won't touch the callbacks and keep sleeping until we ever
+ * get re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+
+ WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
+
+ if (rdp->nocb_cb_sleep)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
+ */
+static int rcu_nocb_cb_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
+ for (;;) {
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp, int level,
+ unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ int ndw;
+ int ret;
+
+ if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ return false;
+ }
+
+ ndw = rdp_gp->nocb_defer_wakeup;
+ ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+ WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+
+ raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+ return false;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
+}
+
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+
+static int rdp_offload_toggle(struct rcu_data *rdp,
+ bool offload, unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+
+ rcu_segcblist_offload(cblist, offload);
+
+ if (rdp->nocb_cb_sleep)
+ rdp->nocb_cb_sleep = false;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ /*
+ * Ignore former value of nocb_cb_sleep and force wake up as it could
+ * have been spuriously set to false already.
+ */
+ swake_up_one(&rdp->nocb_cb_wq);
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ return 0;
+}
+
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Flush once and for all now. This suffices because we are
+ * running on the target CPU holding ->nocb_lock (thus having
+ * interrupts disabled), and because rdp_offload_toggle()
+ * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
+ * Thus future calls to rcu_segcblist_completely_offloaded() will
+ * return false, which means that future calls to rcu_nocb_try_bypass()
+ * will refuse to put anything into the bypass.
+ */
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ ret = rdp_offload_toggle(rdp, false, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
+ SEGCBLIST_KTHREAD_GP));
+ /*
+ * Lock one last time to acquire latest callback updates from kthreads
+ * so we can later handle callbacks locally without locking.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
+ * lock is released but how about being paranoid for once?
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
+ /*
+ * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
+ * rcu_nocb_unlock_irqrestore() anymore.
+ */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ /* Sanity check */
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+
+
+ return ret;
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
+ ret = -EINVAL;
+ }
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+static long rcu_nocb_rdp_offload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ /*
+ * For now we only support re-offload, ie: the rdp must have been
+ * offloaded on boot first.
+ */
+ if (!rdp->nocb_gp_rdp)
+ return -EINVAL;
+
+ pr_info("Offloading %d\n", rdp->cpu);
+ /*
+ * Can't use rcu_nocb_lock_irqsave() while we are in
+ * SEGCBLIST_SOFTIRQ_ONLY mode.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+
+ /*
+ * We didn't take the nocb lock while working on the
+ * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
+ * Every modifications that have been done previously on
+ * rdp->cblist must be visible remotely by the nocb kthreads
+ * upon wake up after reading the cblist flags.
+ *
+ * The layout against nocb_lock enforces that ordering:
+ *
+ * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
+ * ------------------------- ----------------------------
+ * WRITE callbacks rcu_nocb_lock()
+ * rcu_nocb_lock() READ flags
+ * WRITE flags READ callbacks
+ * rcu_nocb_unlock() rcu_nocb_unlock()
+ */
+ ret = rdp_offload_toggle(rdp, true, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+ return ret;
+}
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Can't CB-offload an offline CPU\n");
+ ret = -EINVAL;
+ }
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ bool need_rcu_nocb_mask = false;
+ struct rcu_data *rdp;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
+ need_rcu_nocb_mask = true;
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
+ }
+ if (!cpumask_available(rcu_nocb_mask))
+ return;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running)
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ if (cpumask_empty(rcu_nocb_mask))
+ pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+ else
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist, true);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
+ }
+ rcu_organize_nocb_kthreads();
+}
+
+/* Initialize per-rcu_data variables for no-CBs CPUs. */
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
+ raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
+ */
+static void rcu_spawn_one_nocb_kthread(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
+ struct task_struct *t;
+
+ /*
+ * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
+ * then nothing to do.
+ */
+ if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
+ return;
+
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ rdp_gp = rdp->nocb_gp_rdp;
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
+ return;
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+ }
+
+ /* Spawn the kthread for this CPU. */
+ t = kthread_run(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
+ return;
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo kthread, spawn it.
+ */
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+ if (rcu_scheduler_fully_active)
+ rcu_spawn_one_nocb_kthread(cpu);
+}
+
+/*
+ * Once the scheduler is running, spawn rcuo kthreads for all online
+ * no-CBs CPUs. This assumes that the early_initcall()s happen before
+ * non-boot CPUs come online -- if this changes, we will need to add
+ * some mutual exclusion.
+ */
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ rcu_spawn_cpu_nocb_kthread(cpu);
+}
+
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
+
+/*
+ * Initialize GP-CB relationships for all no-CBs CPU.
+ */
+static void __init rcu_organize_nocb_kthreads(void)
+{
+ int cpu;
+ bool firsttime = true;
+ bool gotnocbs = false;
+ bool gotnocbscbs = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_prev = NULL;
+
+ if (!cpumask_available(rcu_nocb_mask))
+ return;
+ if (ls == -1) {
+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+ rcu_nocb_gp_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure.
+ * Should the corresponding CPU come online in the future, then
+ * we will spawn the needed set of rcu_nocb_kthread() kthreads.
+ */
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->cpu >= nl) {
+ /* New GP kthread, set up for CBs & next GP. */
+ gotnocbs = true;
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp->nocb_gp_rdp = rdp;
+ rdp_gp = rdp;
+ if (dump_tree) {
+ if (!firsttime)
+ pr_cont("%s\n", gotnocbscbs
+ ? "" : " (self only)");
+ gotnocbscbs = false;
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:",
+ __func__, cpu);
+ }
+ } else {
+ /* Another CB kthread, link to previous GP kthread. */
+ gotnocbscbs = true;
+ rdp->nocb_gp_rdp = rdp_gp;
+ rdp_prev->nocb_next_cb_rdp = rdp;
+ if (dump_tree)
+ pr_cont(" %d", cpu);
+ }
+ rdp_prev = rdp;
+ }
+ if (gotnocbs && dump_tree)
+ pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
+}
+
+/*
+ * Bind the current task to the offloaded CPUs. If there are no offloaded
+ * CPUs, leave the task unbound. Splat if the bind attempt fails.
+ */
+void rcu_bind_current_to_nocb(void)
+{
+ if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
+ WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
+}
+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+
+// The ->on_cpu field is available only in CONFIG_SMP=y, so...
+#ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
+}
+#else // #ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return "";
+}
+#endif // #else #ifdef CONFIG_SMP
+
+/*
+ * Dump out nocb grace-period kthread state for the specified rcu_data
+ * structure.
+ */
+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
+{
+ struct rcu_node *rnp = rdp->mynode;
+
+ pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+ rdp->cpu,
+ "kK"[!!rdp->nocb_gp_kthread],
+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[timer_pending(&rdp->nocb_timer)],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[swait_active(&rdp->nocb_gp_wq)],
+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
+ ".B"[!!rdp->nocb_gp_bypass],
+ ".G"[!!rdp->nocb_gp_gp],
+ (long)rdp->nocb_gp_seq,
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+}
+
+/* Dump out nocb kthread state for the specified rcu_data structure. */
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+ char bufw[20];
+ char bufr[20];
+ struct rcu_segcblist *rsclp = &rdp->cblist;
+ bool waslocked;
+ bool wassleep;
+
+ if (rdp->nocb_gp_rdp == rdp)
+ show_rcu_nocb_gp_state(rdp);
+
+ sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
+ "kK"[!!rdp->nocb_cb_kthread],
+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
+ "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
+ "sS"[!!rdp->nocb_cb_sleep],
+ ".W"[swait_active(&rdp->nocb_cb_wq)],
+ jiffies - rdp->nocb_bypass_first,
+ jiffies - rdp->nocb_nobypass_last,
+ rdp->nocb_nobypass_count,
+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
+ ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
+ ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
+ ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+
+ /* It is OK for GP kthreads to have GP state. */
+ if (rdp->nocb_gp_rdp == rdp)
+ return;
+
+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
+ wassleep = swait_active(&rdp->nocb_gp_wq);
+ if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
+ return; /* Nothing untoward. */
+
+ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
+ "lL"[waslocked],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[wassleep]);
+}
+
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return 0;
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ return false;
+}
+
+/* No ->nocb_lock to acquire. */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ local_irq_restore(flags);
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+}
+
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return NULL;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+}
+
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ return true;
+}
+
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
+{
+ return false;
+}
+
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+}
+
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return false;
+}
+
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ return false;
+}
+
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+}
+
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+}
+
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ad0156b869371..27b74352cccf0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -13,48 +13,6 @@
#include "../locking/rtmutex_common.h"
-#ifdef CONFIG_RCU_NOCB_CPU
-static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return lockdep_is_held(&rdp->nocb_lock);
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
- /* Race on early boot between thread creation and assignment */
- if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
- return true;
-
- if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
- if (in_task())
- return true;
- return false;
-}
-
-static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
-{
- return (timer_curr_running(&rdp->nocb_timer) && !in_irq());
-}
-#else
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return 0;
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
- return false;
-}
-
-static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
-{
- return false;
-}
-
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
-
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
{
/*
@@ -72,8 +30,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
rcu_lockdep_is_held_nocb(rdp) ||
(rdp == this_cpu_ptr(&rcu_data) &&
!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
- rcu_current_is_nocb_kthread(rdp) ||
- rcu_running_nocb_timer(rdp)),
+ rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
);
@@ -415,17 +372,20 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
static void rcu_preempt_read_enter(void)
{
- current->rcu_read_lock_nesting++;
+ WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1);
}
static int rcu_preempt_read_exit(void)
{
- return --current->rcu_read_lock_nesting;
+ int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1;
+
+ WRITE_ONCE(current->rcu_read_lock_nesting, ret);
+ return ret;
}
static void rcu_preempt_depth_set(int val)
{
- current->rcu_read_lock_nesting = val;
+ WRITE_ONCE(current->rcu_read_lock_nesting, val);
}
/*
@@ -1098,6 +1058,7 @@ static int rcu_boost(struct rcu_node *rnp)
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+ rnp->n_boosts++;
return READ_ONCE(rnp->exp_tasks) != NULL ||
READ_ONCE(rnp->boost_tasks) != NULL;
@@ -1197,22 +1158,16 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
*/
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
- int rnp_index = rnp - rcu_get_root();
unsigned long flags;
+ int rnp_index = rnp - rcu_get_root();
struct sched_param sp;
struct task_struct *t;
- if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
- return;
-
- if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
+ if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
return;
rcu_state.boost = 1;
- if (rnp->boost_kthread_task != NULL)
- return;
-
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
@@ -1264,17 +1219,8 @@ static void __init rcu_spawn_boost_kthreads(void)
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp)
- rcu_spawn_one_boost_kthread(rnp);
-}
-
-static void rcu_prepare_kthreads(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_boost_kthread(rnp);
+ if (rcu_rnp_online_cpus(rnp))
+ rcu_spawn_one_boost_kthread(rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1294,15 +1240,15 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
-static void __init rcu_spawn_boost_kthreads(void)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
-static void rcu_prepare_kthreads(int cpu)
+static void __init rcu_spawn_boost_kthreads(void)
{
}
@@ -1503,1449 +1449,6 @@ static void rcu_cleanup_after_idle(void)
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#ifdef CONFIG_RCU_NOCB_CPU
-
-/*
- * Offload callback processing from the boot-time-specified set of CPUs
- * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
- * created that pull the callbacks from the corresponding CPU, wait for
- * a grace period to elapse, and invoke the callbacks. These kthreads
- * are organized into GP kthreads, which manage incoming callbacks, wait for
- * grace periods, and awaken CB kthreads, and the CB kthreads, which only
- * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
- * do a wake_up() on their GP kthread when they insert a callback into any
- * empty list, unless the rcu_nocb_poll boot parameter has been specified,
- * in which case each kthread actively polls its CPU. (Which isn't so great
- * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
- *
- * This is intended to be used in conjunction with Frederic Weisbecker's
- * adaptive-idle work, which would seriously reduce OS jitter on CPUs
- * running CPU-bound user-mode computations.
- *
- * Offloading of callbacks can also be used as an energy-efficiency
- * measure because CPUs with no RCU callbacks queued are more aggressive
- * about entering dyntick-idle mode.
- */
-
-
-/*
- * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
- * If the list is invalid, a warning is emitted and all CPUs are offloaded.
- */
-static int __init rcu_nocb_setup(char *str)
-{
- alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- if (!strcasecmp(str, "all")) /* legacy: use "0-N" instead */
- cpumask_setall(rcu_nocb_mask);
- else
- if (cpulist_parse(str, rcu_nocb_mask)) {
- pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
- cpumask_setall(rcu_nocb_mask);
- }
- return 1;
-}
-__setup("rcu_nocbs=", rcu_nocb_setup);
-
-static int __init parse_rcu_nocb_poll(char *arg)
-{
- rcu_nocb_poll = true;
- return 0;
-}
-early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
-
-/*
- * Don't bother bypassing ->cblist if the call_rcu() rate is low.
- * After all, the main point of bypassing is to avoid lock contention
- * on ->nocb_lock, which only can happen at high call_rcu() rates.
- */
-static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
-module_param(nocb_nobypass_lim_per_jiffy, int, 0);
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
- * lock isn't immediately available, increment ->nocb_lock_contended to
- * flag the contention.
- */
-static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
- __acquires(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- if (raw_spin_trylock(&rdp->nocb_bypass_lock))
- return;
- atomic_inc(&rdp->nocb_lock_contended);
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- smp_mb__after_atomic(); /* atomic_inc() before lock. */
- raw_spin_lock(&rdp->nocb_bypass_lock);
- smp_mb__before_atomic(); /* atomic_dec() after lock. */
- atomic_dec(&rdp->nocb_lock_contended);
-}
-
-/*
- * Spinwait until the specified rcu_data structure's ->nocb_lock is
- * not contended. Please note that this is extremely special-purpose,
- * relying on the fact that at most two kthreads and one CPU contend for
- * this lock, and also that the two kthreads are guaranteed to have frequent
- * grace-period-duration time intervals between successive acquisitions
- * of the lock. This allows us to use an extremely simple throttling
- * mechanism, and further to apply it only to the CPU doing floods of
- * call_rcu() invocations. Don't try this at home!
- */
-static void rcu_nocb_wait_contended(struct rcu_data *rdp)
-{
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
- cpu_relax();
-}
-
-/*
- * Conditionally acquire the specified rcu_data structure's
- * ->nocb_bypass_lock.
- */
-static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- return raw_spin_trylock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_bypass_lock.
- */
-static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
- __releases(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (!rcu_rdp_is_offloaded(rdp))
- return;
- raw_spin_lock(&rdp->nocb_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
- if (rcu_rdp_is_offloaded(rdp)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_lock);
- }
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock and restore
- * interrupts, but only if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- if (rcu_rdp_is_offloaded(rdp)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- } else {
- local_irq_restore(flags);
- }
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (rcu_rdp_is_offloaded(rdp))
- lockdep_assert_held(&rdp->nocb_lock);
-}
-
-/*
- * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
- * grace period.
- */
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
- swake_up_all(sq);
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
- init_swait_queue_head(&rnp->nocb_gp_wq[0]);
- init_swait_queue_head(&rnp->nocb_gp_wq[1]);
-}
-
-/* Is the specified CPU a no-CBs CPU? */
-bool rcu_is_nocb_cpu(int cpu)
-{
- if (cpumask_available(rcu_nocb_mask))
- return cpumask_test_cpu(cpu, rcu_nocb_mask);
- return false;
-}
-
-/*
- * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
- * and this function releases it.
- */
-static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
- unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- bool needwake = false;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-
- lockdep_assert_held(&rdp->nocb_lock);
- if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("AlreadyAwake"));
- return false;
- }
-
- if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&rdp->nocb_timer);
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
- WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
- needwake = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
- }
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- if (needwake)
- wake_up_process(rdp_gp->nocb_gp_kthread);
-
- return needwake;
-}
-
-/*
- * Arrange to wake the GP kthread for this NOCB group at some future
- * time when it is safe to do so.
- */
-static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
- const char *reason)
-{
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
- return;
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
- mod_timer(&rdp->nocb_timer, jiffies + 1);
- if (rdp->nocb_defer_wakeup < waketype)
- WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- struct rcu_cblist rcl;
-
- WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
- rcu_lockdep_assert_cblist_protected(rdp);
- lockdep_assert_held(&rdp->nocb_bypass_lock);
- if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
- raw_spin_unlock(&rdp->nocb_bypass_lock);
- return false;
- }
- /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
- if (rhp)
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- rcu_nocb_bypass_unlock(rdp);
- return true;
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- if (!rcu_rdp_is_offloaded(rdp))
- return true;
- rcu_lockdep_assert_cblist_protected(rdp);
- rcu_nocb_bypass_lock(rdp);
- return rcu_nocb_do_flush_bypass(rdp, rhp, j);
-}
-
-/*
- * If the ->nocb_bypass_lock is immediately available, flush the
- * ->nocb_bypass queue into ->cblist.
- */
-static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
-{
- rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_rdp_is_offloaded(rdp) ||
- !rcu_nocb_bypass_trylock(rdp))
- return;
- WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
-}
-
-/*
- * See whether it is appropriate to use the ->nocb_bypass list in order
- * to control contention on ->nocb_lock. A limited number of direct
- * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
- * is non-empty, further callbacks must be placed into ->nocb_bypass,
- * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
- * back to direct use of ->cblist. However, ->nocb_bypass should not be
- * used if ->cblist is empty, because otherwise callbacks can be stranded
- * on ->nocb_bypass because we cannot count on the current CPU ever again
- * invoking call_rcu(). The general rule is that if ->nocb_bypass is
- * non-empty, the corresponding no-CBs grace-period kthread must not be
- * in an indefinite sleep state.
- *
- * Finally, it is not permitted to use the bypass during early boot,
- * as doing so would confuse the auto-initialization code. Besides
- * which, there is no point in worrying about lock contention while
- * there is only one CPU in operation.
- */
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- unsigned long c;
- unsigned long cur_gp_seq;
- unsigned long j = jiffies;
- long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-
- lockdep_assert_irqs_disabled();
-
- // Pure softirq/rcuc based processing: no bypassing, no
- // locking.
- if (!rcu_rdp_is_offloaded(rdp)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false;
- }
-
- // In the process of (de-)offloading: no bypassing, but
- // locking.
- if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false; /* Not offloaded, no bypassing. */
- }
-
- // Don't use ->nocb_bypass during early boot.
- if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
- rcu_nocb_lock(rdp);
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false;
- }
-
- // If we have advanced to a new jiffy, reset counts to allow
- // moving back from ->nocb_bypass to ->cblist.
- if (j == rdp->nocb_nobypass_last) {
- c = rdp->nocb_nobypass_count + 1;
- } else {
- WRITE_ONCE(rdp->nocb_nobypass_last, j);
- c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
- if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
- nocb_nobypass_lim_per_jiffy))
- c = 0;
- else if (c > nocb_nobypass_lim_per_jiffy)
- c = nocb_nobypass_lim_per_jiffy;
- }
- WRITE_ONCE(rdp->nocb_nobypass_count, c);
-
- // If there hasn't yet been all that many ->cblist enqueues
- // this jiffy, tell the caller to enqueue onto ->cblist. But flush
- // ->nocb_bypass first.
- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
-
- // If ->nocb_bypass has been used too long or is too full,
- // flush ->nocb_bypass to ->cblist.
- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
- ncbs >= qhimark) {
- rcu_nocb_lock(rdp);
- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return true; // Callback already enqueued.
- }
-
- // We need to use the bypass.
- rcu_nocb_wait_contended(rdp);
- rcu_nocb_bypass_lock(rdp);
- ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
- if (!ncbs) {
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
- }
- rcu_nocb_bypass_unlock(rdp);
- smp_mb(); /* Order enqueue before wake. */
- if (ncbs) {
- local_irq_restore(flags);
- } else {
- // No-CBs GP kthread might be indefinitely asleep, if so, wake.
- rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
- if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQwake"));
- __call_rcu_nocb_wake(rdp, true, flags);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQnoWake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- }
- return true; // Callback already enqueued.
-}
-
-/*
- * Awaken the no-CBs grace-period kthead if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
- */
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
- unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- unsigned long cur_gp_seq;
- unsigned long j;
- long len;
- struct task_struct *t;
-
- // If we are being polled or there is no kthread, just leave.
- t = READ_ONCE(rdp->nocb_gp_kthread);
- if (rcu_nocb_poll || !t) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeNotPoll"));
- return;
- }
- // Need to actually to a wakeup.
- len = rcu_segcblist_n_cbs(&rdp->cblist);
- if (was_alldone) {
- rdp->qlen_last_fqs_check = len;
- if (!irqs_disabled_flags(flags)) {
- /* ... if queue was empty ... */
- wake_nocb_gp(rdp, false, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeEmpty"));
- } else {
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
- TPS("WakeEmptyIsDeferred"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- } else if (len > rdp->qlen_last_fqs_check + qhimark) {
- /* ... or if many callbacks queued. */
- rdp->qlen_last_fqs_check = len;
- j = jiffies;
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- smp_mb(); /* Enqueue before timer_pending(). */
- if ((rdp->nocb_cb_sleep ||
- !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp->nocb_bypass_timer))
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- } else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- }
- return;
-}
-
-/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
-static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
-{
- unsigned long flags;
- struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
-
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
- rcu_nocb_lock_irqsave(rdp, flags);
- smp_mb__after_spinlock(); /* Timer expire before wakeup. */
- __call_rcu_nocb_wake(rdp, true, flags);
-}
-
-/*
- * Check if we ignore this rdp.
- *
- * We check that without holding the nocb lock but
- * we make sure not to miss a freshly offloaded rdp
- * with the current ordering:
- *
- * rdp_offload_toggle() nocb_gp_enabled_cb()
- * ------------------------- ----------------------------
- * WRITE flags LOCK nocb_gp_lock
- * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
- * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
- * UNLOCK nocb_gp_lock READ flags
- */
-static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
-
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
- bool *needwake_state)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
-
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- }
- return false;
- }
-
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We will ignore this rdp until it ever gets re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- return true;
-}
-
-
-/*
- * No-CBs GP kthreads come here to wait for additional callbacks to show up
- * or for grace periods to end.
- */
-static void nocb_gp_wait(struct rcu_data *my_rdp)
-{
- bool bypass = false;
- long bypass_ncbs;
- int __maybe_unused cpu = my_rdp->cpu;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool gotcbs = false;
- unsigned long j = jiffies;
- bool needwait_gp = false; // This prevents actual uninitialized use.
- bool needwake;
- bool needwake_gp;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
- bool wasempty = false;
-
- /*
- * Each pass through the following loop checks for CBs and for the
- * nearest grace period (if any) to wait for next. The CB kthreads
- * and the global grace-period kthread are awakened if needed.
- */
- WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
- bool needwake_state = false;
-
- if (!nocb_gp_enabled_cb(rdp))
- continue;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
- rcu_nocb_lock_irqsave(rdp, flags);
- if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- continue;
- }
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- if (bypass_ncbs &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
- bypass_ncbs > 2 * qhimark)) {
- // Bypass full or old, so flush it.
- (void)rcu_nocb_try_flush_bypass(rdp, j);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- continue; /* No callbacks here, try next. */
- }
- if (bypass_ncbs) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("Bypass"));
- bypass = true;
- }
- rnp = rdp->mynode;
- if (bypass) { // Avoid race with first bypass CB.
- WRITE_ONCE(my_rdp->nocb_defer_wakeup,
- RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- }
- // Advance callbacks if helpful and low contention.
- needwake_gp = false;
- if (!rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL) ||
- (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
- raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
- needwake_gp = rcu_advance_cbs(rnp, rdp);
- wasempty = rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL);
- raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
- }
- // Need to wait on some grace period?
- WARN_ON_ONCE(wasempty &&
- !rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL));
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
- if (!needwait_gp ||
- ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
- wait_gp_seq = cur_gp_seq;
- needwait_gp = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("NeedWaitGP"));
- }
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- needwake = rdp->nocb_cb_sleep;
- WRITE_ONCE(rdp->nocb_cb_sleep, false);
- smp_mb(); /* CB invocation -after- GP end. */
- } else {
- needwake = false;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake) {
- swake_up_one(&rdp->nocb_cb_wq);
- gotcbs = true;
- }
- if (needwake_gp)
- rcu_gp_kthread_wake();
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- }
-
- my_rdp->nocb_gp_bypass = bypass;
- my_rdp->nocb_gp_gp = needwait_gp;
- my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
- if (bypass && !rcu_nocb_poll) {
- // At least one child with non-empty ->nocb_bypass, so set
- // timer in order to avoid stranding its callbacks.
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
- }
- if (rcu_nocb_poll) {
- /* Polling, so trace if first poll in the series. */
- if (gotcbs)
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
- schedule_timeout_idle(1);
- } else if (!needwait_gp) {
- /* Wait for callbacks to appear. */
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
- } else {
- rnp = my_rdp->mynode;
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
- swait_event_interruptible_exclusive(
- rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
- rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
- }
- if (!rcu_nocb_poll) {
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- if (bypass)
- del_timer(&my_rdp->nocb_bypass_timer);
- WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
- }
- my_rdp->nocb_gp_seq = -1;
- WARN_ON(signal_pending(current));
-}
-
-/*
- * No-CBs grace-period-wait kthread. There is one of these per group
- * of CPUs, but only once at least one CPU in that group has come online
- * at least once since boot. This kthread checks for newly posted
- * callbacks from any of the CPUs it is responsible for, waits for a
- * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
- * that then have callback-invocation work to do.
- */
-static int rcu_nocb_gp_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- for (;;) {
- WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
- nocb_gp_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-static inline bool nocb_cb_can_run(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
-{
- return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
-}
-
-/*
- * Invoke any ready callbacks from the corresponding no-CBs CPU,
- * then, if there are no more, wait for more to appear.
- */
-static void nocb_cb_wait(struct rcu_data *rdp)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool needwake_state = false;
- bool needwake_gp = false;
- bool can_sleep = true;
- struct rcu_node *rnp = rdp->mynode;
-
- local_irq_save(flags);
- rcu_momentary_dyntick_idle();
- local_irq_restore(flags);
- /*
- * Disable BH to provide the expected environment. Also, when
- * transitioning to/from NOCB mode, a self-requeuing callback might
- * be invoked from softirq. A short grace period could cause both
- * instances of this callback would execute concurrently.
- */
- local_bh_disable();
- rcu_do_batch(rdp);
- local_bh_enable();
- lockdep_assert_irqs_enabled();
- rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
- raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
- needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
-
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
- if (rcu_segcblist_ready_cbs(cblist))
- can_sleep = false;
- } else {
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We won't touch the callbacks and keep sleeping until we ever
- * get re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
-
- WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
-
- if (rdp->nocb_cb_sleep)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
-
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
-
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
-
- do {
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- nocb_cb_wait_cond(rdp));
-
- // VVV Ensure CB invocation follows _sleep test.
- if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
- }
- } while (!nocb_cb_can_run(rdp));
-}
-
-/*
- * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
- * nocb_cb_wait() to do the dirty work.
- */
-static int rcu_nocb_cb_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- // Each pass through this loop does one callback batch, and,
- // if there are no more ready callbacks, waits for them.
- for (;;) {
- nocb_cb_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-/* Is a deferred wakeup of rcu_nocb_kthread() required? */
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
-{
- return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
-{
- unsigned long flags;
- int ndw;
- int ret;
-
- rcu_nocb_lock_irqsave(rdp, flags);
- if (!rcu_nocb_need_deferred_wakeup(rdp)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return false;
- }
- ndw = READ_ONCE(rdp->nocb_defer_wakeup);
- ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
-
- return ret;
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
-static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
-{
- struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
-
- do_nocb_deferred_wakeup_common(rdp);
-}
-
-/*
- * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
- * This means we do an inexact common-case check. Note that if
- * we miss, ->nocb_timer will eventually clean things up.
- */
-static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- if (rcu_nocb_need_deferred_wakeup(rdp))
- return do_nocb_deferred_wakeup_common(rdp);
- return false;
-}
-
-void rcu_nocb_flush_deferred_wakeup(void)
-{
- do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
-
-static int rdp_offload_toggle(struct rcu_data *rdp,
- bool offload, unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
- bool wake_gp = false;
-
- rcu_segcblist_offload(cblist, offload);
-
- if (rdp->nocb_cb_sleep)
- rdp->nocb_cb_sleep = false;
- rcu_nocb_unlock_irqrestore(rdp, flags);
-
- /*
- * Ignore former value of nocb_cb_sleep and force wake up as it could
- * have been spuriously set to false already.
- */
- swake_up_one(&rdp->nocb_cb_wq);
-
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- if (rdp_gp->nocb_gp_sleep) {
- rdp_gp->nocb_gp_sleep = false;
- wake_gp = true;
- }
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
-
- if (wake_gp)
- wake_up_process(rdp_gp->nocb_gp_kthread);
-
- return 0;
-}
-
-static long rcu_nocb_rdp_deoffload(void *arg)
-{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long flags;
- int ret;
-
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
-
- pr_info("De-offloading %d\n", rdp->cpu);
-
- rcu_nocb_lock_irqsave(rdp, flags);
- /*
- * Flush once and for all now. This suffices because we are
- * running on the target CPU holding ->nocb_lock (thus having
- * interrupts disabled), and because rdp_offload_toggle()
- * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
- * Thus future calls to rcu_segcblist_completely_offloaded() will
- * return false, which means that future calls to rcu_nocb_try_bypass()
- * will refuse to put anything into the bypass.
- */
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
- ret = rdp_offload_toggle(rdp, false, flags);
- swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
- SEGCBLIST_KTHREAD_GP));
- rcu_nocb_lock_irqsave(rdp, flags);
- /* Make sure nocb timer won't stay around */
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- del_timer_sync(&rdp->nocb_timer);
-
- /*
- * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY with CB unlocked
- * and IRQs disabled but let's be paranoid.
- */
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
- /*
- * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
- * rcu_nocb_unlock_irqrestore() anymore.
- */
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
-
- /* Sanity check */
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
-
-
- return ret;
-}
-
-int rcu_nocb_cpu_deoffload(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int ret = 0;
-
- if (rdp == rdp->nocb_gp_rdp) {
- pr_info("Can't deoffload an rdp GP leader (yet)\n");
- return -EINVAL;
- }
- mutex_lock(&rcu_state.barrier_mutex);
- cpus_read_lock();
- if (rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
- if (!ret)
- cpumask_clear_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
- ret = -EINVAL;
- }
- }
- cpus_read_unlock();
- mutex_unlock(&rcu_state.barrier_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
-
-static long rcu_nocb_rdp_offload(void *arg)
-{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long flags;
- int ret;
-
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
- /*
- * For now we only support re-offload, ie: the rdp must have been
- * offloaded on boot first.
- */
- if (!rdp->nocb_gp_rdp)
- return -EINVAL;
-
- pr_info("Offloading %d\n", rdp->cpu);
- /*
- * Can't use rcu_nocb_lock_irqsave() while we are in
- * SEGCBLIST_SOFTIRQ_ONLY mode.
- */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- /* Re-enable nocb timer */
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- /*
- * We didn't take the nocb lock while working on the
- * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
- * Every modifications that have been done previously on
- * rdp->cblist must be visible remotely by the nocb kthreads
- * upon wake up after reading the cblist flags.
- *
- * The layout against nocb_lock enforces that ordering:
- *
- * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
- * ------------------------- ----------------------------
- * WRITE callbacks rcu_nocb_lock()
- * rcu_nocb_lock() READ flags
- * WRITE flags READ callbacks
- * rcu_nocb_unlock() rcu_nocb_unlock()
- */
- ret = rdp_offload_toggle(rdp, true, flags);
- swait_event_exclusive(rdp->nocb_state_wq,
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
-
- return ret;
-}
-
-int rcu_nocb_cpu_offload(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int ret = 0;
-
- mutex_lock(&rcu_state.barrier_mutex);
- cpus_read_lock();
- if (!rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
- if (!ret)
- cpumask_set_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Can't CB-offload an offline CPU\n");
- ret = -EINVAL;
- }
- }
- cpus_read_unlock();
- mutex_unlock(&rcu_state.barrier_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
-
-void __init rcu_init_nohz(void)
-{
- int cpu;
- bool need_rcu_nocb_mask = false;
- struct rcu_data *rdp;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
- need_rcu_nocb_mask = true;
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
- if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
- pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
- return;
- }
- }
- if (!cpumask_available(rcu_nocb_mask))
- return;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running)
- cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
- cpumask_and(rcu_nocb_mask, cpu_possible_mask,
- rcu_nocb_mask);
- }
- if (cpumask_empty(rcu_nocb_mask))
- pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
- else
- pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
- cpumask_pr_args(rcu_nocb_mask));
- if (rcu_nocb_poll)
- pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
-
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_segcblist_empty(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
- }
- rcu_organize_nocb_kthreads();
-}
-
-/* Initialize per-rcu_data variables for no-CBs CPUs. */
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
- init_swait_queue_head(&rdp->nocb_cb_wq);
- init_swait_queue_head(&rdp->nocb_gp_wq);
- init_swait_queue_head(&rdp->nocb_state_wq);
- raw_spin_lock_init(&rdp->nocb_lock);
- raw_spin_lock_init(&rdp->nocb_bypass_lock);
- raw_spin_lock_init(&rdp->nocb_gp_lock);
- timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
- timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
- rcu_cblist_init(&rdp->nocb_bypass);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
- * for this CPU's group has not yet been created, spawn it as well.
- */
-static void rcu_spawn_one_nocb_kthread(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_data *rdp_gp;
- struct task_struct *t;
-
- /*
- * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
- * then nothing to do.
- */
- if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
- return;
-
- /* If we didn't spawn the GP kthread first, reorganize! */
- rdp_gp = rdp->nocb_gp_rdp;
- if (!rdp_gp->nocb_gp_kthread) {
- t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
- "rcuog/%d", rdp_gp->cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
- }
-
- /* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_cb_kthread, rdp,
- "rcuo%c/%d", rcu_state.abbr, cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp->nocb_cb_kthread, t);
- WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it.
- */
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_nocb_kthread(cpu);
-}
-
-/*
- * Once the scheduler is running, spawn rcuo kthreads for all online
- * no-CBs CPUs. This assumes that the early_initcall()s happen before
- * non-boot CPUs come online -- if this changes, we will need to add
- * some mutual exclusion.
- */
-static void __init rcu_spawn_nocb_kthreads(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- rcu_spawn_cpu_nocb_kthread(cpu);
-}
-
-/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
-static int rcu_nocb_gp_stride = -1;
-module_param(rcu_nocb_gp_stride, int, 0444);
-
-/*
- * Initialize GP-CB relationships for all no-CBs CPU.
- */
-static void __init rcu_organize_nocb_kthreads(void)
-{
- int cpu;
- bool firsttime = true;
- bool gotnocbs = false;
- bool gotnocbscbs = true;
- int ls = rcu_nocb_gp_stride;
- int nl = 0; /* Next GP kthread. */
- struct rcu_data *rdp;
- struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
- struct rcu_data *rdp_prev = NULL;
-
- if (!cpumask_available(rcu_nocb_mask))
- return;
- if (ls == -1) {
- ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
- rcu_nocb_gp_stride = ls;
- }
-
- /*
- * Each pass through this loop sets up one rcu_data structure.
- * Should the corresponding CPU come online in the future, then
- * we will spawn the needed set of rcu_nocb_kthread() kthreads.
- */
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rdp->cpu >= nl) {
- /* New GP kthread, set up for CBs & next GP. */
- gotnocbs = true;
- nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_gp_rdp = rdp;
- rdp_gp = rdp;
- if (dump_tree) {
- if (!firsttime)
- pr_cont("%s\n", gotnocbscbs
- ? "" : " (self only)");
- gotnocbscbs = false;
- firsttime = false;
- pr_alert("%s: No-CB GP kthread CPU %d:",
- __func__, cpu);
- }
- } else {
- /* Another CB kthread, link to previous GP kthread. */
- gotnocbscbs = true;
- rdp->nocb_gp_rdp = rdp_gp;
- rdp_prev->nocb_next_cb_rdp = rdp;
- if (dump_tree)
- pr_cont(" %d", cpu);
- }
- rdp_prev = rdp;
- }
- if (gotnocbs && dump_tree)
- pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
-}
-
-/*
- * Bind the current task to the offloaded CPUs. If there are no offloaded
- * CPUs, leave the task unbound. Splat if the bind attempt fails.
- */
-void rcu_bind_current_to_nocb(void)
-{
- if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
- WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
-}
-EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
-
-// The ->on_cpu field is available only in CONFIG_SMP=y, so...
-#ifdef CONFIG_SMP
-static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
-{
- return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
-}
-#else // #ifdef CONFIG_SMP
-static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
-{
- return "";
-}
-#endif // #else #ifdef CONFIG_SMP
-
-/*
- * Dump out nocb grace-period kthread state for the specified rcu_data
- * structure.
- */
-static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
-{
- struct rcu_node *rnp = rdp->mynode;
-
- pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
- rdp->cpu,
- "kK"[!!rdp->nocb_gp_kthread],
- "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
- "dD"[!!rdp->nocb_defer_wakeup],
- "tT"[timer_pending(&rdp->nocb_timer)],
- "bB"[timer_pending(&rdp->nocb_bypass_timer)],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[swait_active(&rdp->nocb_gp_wq)],
- ".W"[swait_active(&rnp->nocb_gp_wq[0])],
- ".W"[swait_active(&rnp->nocb_gp_wq[1])],
- ".B"[!!rdp->nocb_gp_bypass],
- ".G"[!!rdp->nocb_gp_gp],
- (long)rdp->nocb_gp_seq,
- rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
- rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
-}
-
-/* Dump out nocb kthread state for the specified rcu_data structure. */
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
- char bufw[20];
- char bufr[20];
- struct rcu_segcblist *rsclp = &rdp->cblist;
- bool waslocked;
- bool wastimer;
- bool wassleep;
-
- if (rdp->nocb_gp_rdp == rdp)
- show_rcu_nocb_gp_state(rdp);
-
- sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
- sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
- rdp->cpu, rdp->nocb_gp_rdp->cpu,
- rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
- "kK"[!!rdp->nocb_cb_kthread],
- "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
- "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
- "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
- "sS"[!!rdp->nocb_cb_sleep],
- ".W"[swait_active(&rdp->nocb_cb_wq)],
- jiffies - rdp->nocb_bypass_first,
- jiffies - rdp->nocb_nobypass_last,
- rdp->nocb_nobypass_count,
- ".D"[rcu_segcblist_ready_cbs(rsclp)],
- ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
- rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
- ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
- rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
- ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
- ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
- rcu_segcblist_n_cbs(&rdp->cblist),
- rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
-
- /* It is OK for GP kthreads to have GP state. */
- if (rdp->nocb_gp_rdp == rdp)
- return;
-
- waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
- wastimer = timer_pending(&rdp->nocb_bypass_timer);
- wassleep = swait_active(&rdp->nocb_gp_wq);
- if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep)
- return; /* Nothing untowards. */
-
- pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n",
- "lL"[waslocked],
- "dD"[!!rdp->nocb_defer_wakeup],
- "tT"[wastimer],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[wassleep]);
-}
-
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-
-/* No ->nocb_lock to acquire. */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- local_irq_restore(flags);
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
-}
-
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return NULL;
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
-}
-
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- return true;
-}
-
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- return false;
-}
-
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags)
-{
- WARN_ON_ONCE(1); /* Should be dead code! */
-}
-
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
-}
-
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
-{
- return false;
-}
-
-static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- return false;
-}
-
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
-}
-
-static void __init rcu_spawn_nocb_kthreads(void)
-{
-}
-
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 59b95cc5cbdf1..24065f1acb8bf 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -7,6 +7,8 @@
* Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/kvm_para.h>
+
//////////////////////////////////////////////////////////////////////////////
//
// Controlling CPU stall warnings, including delay calculation.
@@ -117,17 +119,14 @@ static void panic_on_rcu_stall(void)
}
/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
- *
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
+ * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
*
* The caller must disable hard irqs.
*/
void rcu_cpu_stall_reset(void)
{
- WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
}
//////////////////////////////////////////////////////////////////////////////
@@ -267,8 +266,10 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
struct task_struct *ts[8];
lockdep_assert_irqs_disabled();
- if (!rcu_preempt_blocked_readers_cgp(rnp))
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
+ }
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
rnp->level, rnp->grplo, rnp->grphi);
t = list_entry(rnp->gp_tasks->prev,
@@ -280,8 +281,8 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
break;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- for (i--; i; i--) {
- t = ts[i];
+ while (i) {
+ t = ts[--i];
if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr))
pr_cont(" P%d", t->pid);
else
@@ -314,6 +315,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
* tasks blocked within RCU read-side critical sections.
*/
static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
@@ -463,9 +465,10 @@ static void rcu_check_gp_kthread_starvation(void)
pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
- data_race(rcu_state.gp_flags),
- gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->state : ~0, cpu);
+ data_race(READ_ONCE(rcu_state.gp_flags)),
+ gp_state_getname(rcu_state.gp_state),
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ gpk ? data_race(READ_ONCE(gpk->state)) : ~0, cpu);
if (gpk) {
pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
pr_err("RCU grace-period kthread stack dump:\n");
@@ -508,7 +511,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void)
(long)rcu_seq_current(&rcu_state.gp_seq),
data_race(rcu_state.gp_flags),
gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
- gpk->state);
+ data_race(READ_ONCE(gpk->state)));
pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
}
@@ -567,11 +570,11 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
pr_err("INFO: Stall ended before state dump start\n");
} else {
j = jiffies;
- gpa = data_race(rcu_state.gp_activity);
+ gpa = data_race(READ_ONCE(rcu_state.gp_activity));
pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
rcu_state.name, j - gpa, j, gpa,
- data_race(jiffies_till_next_fqs),
- rcu_get_root()->qsmask);
+ data_race(READ_ONCE(jiffies_till_next_fqs)),
+ data_race(READ_ONCE(rcu_get_root()->qsmask)));
}
}
/* Rewrite if needed in case of slow consoles. */
@@ -645,6 +648,7 @@ static void print_cpu_stall(unsigned long gps)
static void check_cpu_stall(struct rcu_data *rdp)
{
+ bool didstall = false;
unsigned long gs1;
unsigned long gs2;
unsigned long gps;
@@ -690,24 +694,46 @@ static void check_cpu_stall(struct rcu_data *rdp)
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
- jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ jn = jiffies + ULONG_MAX / 2;
if (rcu_gp_in_progress() &&
(READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
/* We haven't checked in, so go dump stack. */
print_cpu_stall(gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
+ didstall = true;
} else if (rcu_gp_in_progress() &&
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(gs2, gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
+ didstall = true;
+ }
+ if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) {
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcu_state.jiffies_stall, jn);
}
}
@@ -717,6 +743,63 @@ static void check_cpu_stall(struct rcu_data *rdp)
/*
+ * Check to see if a failure to end RCU priority inversion was due to
+ * a CPU not passing through a quiescent state. When this happens, there
+ * is nothing that RCU priority boosting can do to help, so we shouldn't
+ * count this as an RCU priority boosting failure. A return of true says
+ * RCU priority boosting is to blame, and false says otherwise. If false
+ * is returned, the first of the CPUs to blame is stored through cpup.
+ * If there was no CPU blocking the current grace period, but also nothing
+ * in need of being boosted, *cpup is set to -1. This can happen in case
+ * of vCPU preemption while the last CPU is reporting its quiscent state,
+ * for example.
+ *
+ * If cpup is NULL, then a lockless quick check is carried out, suitable
+ * for high-rate usage. On the other hand, if cpup is non-NULL, each
+ * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
+ */
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
+{
+ bool atb = false;
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ if (!cpup) {
+ if (data_race(READ_ONCE(rnp->qsmask))) {
+ return false;
+ } else {
+ if (READ_ONCE(rnp->gp_tasks))
+ atb = true;
+ continue;
+ }
+ }
+ *cpup = -1;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->gp_tasks)
+ atb = true;
+ if (!rnp->qsmask) {
+ // No CPUs without quiescent states for this rnp.
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue;
+ }
+ // Find the first holdout CPU.
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ *cpup = cpu;
+ return false;
+ }
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ // Can't blame CPUs, so must blame RCU priority boosting.
+ return atb;
+}
+EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
+
+/*
* Show the state of the grace-period kthreads.
*/
void show_rcu_gp_kthreads(void)
@@ -726,29 +809,41 @@ void show_rcu_gp_kthreads(void)
unsigned long j;
unsigned long ja;
unsigned long jr;
+ unsigned long js;
unsigned long jw;
struct rcu_data *rdp;
struct rcu_node *rnp;
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
j = jiffies;
- ja = j - data_race(rcu_state.gp_activity);
- jr = j - data_race(rcu_state.gp_req_activity);
- jw = j - data_race(rcu_state.gp_wake_time);
- pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+ ja = j - data_race(READ_ONCE(rcu_state.gp_activity));
+ jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity));
+ js = j - data_race(READ_ONCE(rcu_state.gp_start));
+ jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time));
+ pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state, t ? t->state : 0x1ffffL,
- ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
- (long)data_race(rcu_state.gp_seq),
- (long)data_race(rcu_get_root()->gp_seq_needed),
- data_race(rcu_state.gp_flags));
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ t ? data_race(READ_ONCE(t->state)) : 0x1ffffL, t ? t->rt_priority : 0xffU,
+ js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)),
+ (long)data_race(READ_ONCE(rcu_state.gp_seq)),
+ (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)),
+ data_race(READ_ONCE(rcu_state.gp_max)),
+ data_race(READ_ONCE(rcu_state.gp_flags)));
rcu_for_each_node_breadth_first(rnp) {
- if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
- READ_ONCE(rnp->gp_seq_needed)))
+ if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
+ !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) &&
+ !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks)))
continue;
- pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
- rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq),
- (long)data_race(rnp->gp_seq_needed));
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
+ rnp->grplo, rnp->grphi,
+ (long)data_race(READ_ONCE(rnp->gp_seq)),
+ (long)data_race(READ_ONCE(rnp->gp_seq_needed)),
+ data_race(READ_ONCE(rnp->qsmask)),
+ ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))],
+ ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))],
+ ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))],
+ ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))],
+ data_race(READ_ONCE(rnp->n_boosts)));
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
@@ -758,12 +853,12 @@ void show_rcu_gp_kthreads(void)
READ_ONCE(rdp->gp_seq_needed)))
continue;
pr_info("\tcpu %d ->gp_seq_needed %ld\n",
- cpu, (long)data_race(rdp->gp_seq_needed));
+ cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed)));
}
}
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- cbs += data_race(rdp->n_cbs_invoked);
+ cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
if (rcu_segcblist_is_offloaded(&rdp->cblist))
show_rcu_nocb_state(rdp);
}
@@ -845,11 +940,11 @@ void rcu_fwd_progress_check(unsigned long j)
if (rcu_gp_in_progress()) {
pr_info("%s: GP age %lu jiffies\n",
- __func__, jiffies - rcu_state.gp_start);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start)));
show_rcu_gp_kthreads();
} else {
pr_info("%s: Last GP end %lu jiffies ago\n",
- __func__, jiffies - rcu_state.gp_end);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end)));
preempt_disable();
rdp = this_cpu_ptr(&rcu_data);
rcu_check_gp_start_stall(rdp->mynode, rdp, j);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index b95ae86c40a7d..c21b38cc25e92 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -277,7 +277,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
noinstr int notrace debug_lockdep_rcu_enabled(void)
{
- return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
+ return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -524,6 +524,7 @@ static void test_callback(struct rcu_head *r)
}
DEFINE_STATIC_SRCU(early_srcu);
+static unsigned long early_srcu_cookie;
struct early_boot_kfree_rcu {
struct rcu_head rh;
@@ -536,8 +537,10 @@ static void early_boot_test_call_rcu(void)
struct early_boot_kfree_rcu *rhp;
call_rcu(&head, test_callback);
- if (IS_ENABLED(CONFIG_SRCU))
+ if (IS_ENABLED(CONFIG_SRCU)) {
+ early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
call_srcu(&early_srcu, &shead, test_callback);
+ }
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (!WARN_ON_ONCE(!rhp))
kfree_rcu(rhp, rh);
@@ -563,6 +566,7 @@ static int rcu_verify_early_boot_tests(void)
if (IS_ENABLED(CONFIG_SRCU)) {
early_boot_test_counter++;
srcu_barrier(&early_srcu);
+ WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
}
}
if (rcu_self_test_counter != early_boot_test_counter) {
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 1fb1c1ef6a19b..1ed85b25b0968 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
+obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
new file mode 100644
index 0000000000000..01df12395c0ee
--- /dev/null
+++ b/kernel/time/clocksource-wdtest.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Unit test for the clocksource watchdog.
+ *
+ * Copyright (C) 2021 Facebook, Inc.
+ *
+ * Author: Paul E. McKenney <paulmck@kernel.org>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
+#include <linux/tick.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/prandom.h>
+#include <linux/cpu.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
+
+static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
+module_param(holdoff, int, 0444);
+MODULE_PARM_DESC(holdoff, "Time to wait to start test (s).");
+
+/* Watchdog kthread's task_struct pointer for debug purposes. */
+static struct task_struct *wdtest_task;
+
+static u64 wdtest_jiffies_read(struct clocksource *cs)
+{
+ return (u64)jiffies;
+}
+
+/* Assume HZ > 100. */
+#define JIFFIES_SHIFT 8
+
+static struct clocksource clocksource_wdtest_jiffies = {
+ .name = "wdtest-jiffies",
+ .rating = 1, /* lowest valid rating*/
+ .uncertainty_margin = TICK_NSEC,
+ .read = wdtest_jiffies_read,
+ .mask = CLOCKSOURCE_MASK(32),
+ .flags = CLOCK_SOURCE_MUST_VERIFY,
+ .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
+ .shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
+};
+
+static int wdtest_ktime_read_ndelays;
+static bool wdtest_ktime_read_fuzz;
+
+static u64 wdtest_ktime_read(struct clocksource *cs)
+{
+ int wkrn = READ_ONCE(wdtest_ktime_read_ndelays);
+ static int sign = 1;
+ u64 ret;
+
+ if (wkrn) {
+ udelay(cs->uncertainty_margin / 250);
+ WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1);
+ }
+ ret = ktime_get_real_fast_ns();
+ if (READ_ONCE(wdtest_ktime_read_fuzz)) {
+ sign = -sign;
+ ret = ret + sign * 100 * NSEC_PER_MSEC;
+ }
+ return ret;
+}
+
+static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs)
+{
+ pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name);
+}
+
+#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
+ CLOCK_SOURCE_VALID_FOR_HRES | \
+ CLOCK_SOURCE_MUST_VERIFY | \
+ CLOCK_SOURCE_VERIFY_PERCPU)
+
+static struct clocksource clocksource_wdtest_ktime = {
+ .name = "wdtest-ktime",
+ .rating = 300,
+ .read = wdtest_ktime_read,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = KTIME_FLAGS,
+ .mark_unstable = wdtest_ktime_cs_mark_unstable,
+ .list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list),
+};
+
+/* Reset the clocksource if needed. */
+static void wdtest_ktime_clocksource_reset(void)
+{
+ if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) {
+ clocksource_unregister(&clocksource_wdtest_ktime);
+ clocksource_wdtest_ktime.flags = KTIME_FLAGS;
+ schedule_timeout_uninterruptible(HZ / 10);
+ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+ }
+}
+
+/* Run the specified series of watchdog tests. */
+static int wdtest_func(void *arg)
+{
+ unsigned long j1, j2;
+ char *s;
+ int i;
+
+ schedule_timeout_uninterruptible(holdoff * HZ);
+
+ /*
+ * Verify that jiffies-like clocksources get the manually
+ * specified uncertainty margin.
+ */
+ pr_info("--- Verify jiffies-like uncertainty margin.\n");
+ __clocksource_register(&clocksource_wdtest_jiffies);
+ WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC);
+
+ j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
+ schedule_timeout_uninterruptible(HZ);
+ j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
+ WARN_ON_ONCE(j1 == j2);
+
+ clocksource_unregister(&clocksource_wdtest_jiffies);
+
+ /*
+ * Verify that tsc-like clocksources are assigned a reasonable
+ * uncertainty margin.
+ */
+ pr_info("--- Verify tsc-like uncertainty margin.\n");
+ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+ WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC);
+
+ j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
+ udelay(1);
+ j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
+ pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1);
+ WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC));
+
+ /* Verify tsc-like stability with various numbers of errors injected. */
+ for (i = 0; i <= max_cswd_read_retries + 1; i++) {
+ if (i <= 1 && i < max_cswd_read_retries)
+ s = "";
+ else if (i <= max_cswd_read_retries)
+ s = ", expect message";
+ else
+ s = ", expect clock skew";
+ pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s);
+ WRITE_ONCE(wdtest_ktime_read_ndelays, i);
+ schedule_timeout_uninterruptible(2 * HZ);
+ WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
+ WARN_ON_ONCE((i <= max_cswd_read_retries) !=
+ !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
+ wdtest_ktime_clocksource_reset();
+ }
+
+ /* Verify tsc-like stability with clock-value-fuzz error injection. */
+ pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n");
+ WRITE_ONCE(wdtest_ktime_read_fuzz, true);
+ schedule_timeout_uninterruptible(2 * HZ);
+ WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
+ clocksource_verify_percpu(&clocksource_wdtest_ktime);
+ WRITE_ONCE(wdtest_ktime_read_fuzz, false);
+
+ clocksource_unregister(&clocksource_wdtest_ktime);
+
+ pr_info("--- Done with test.\n");
+ return 0;
+}
+
+static void wdtest_print_module_parms(void)
+{
+ pr_alert("--- holdoff=%d\n", holdoff);
+}
+
+/* Cleanup function. */
+static void clocksource_wdtest_cleanup(void)
+{
+}
+
+static int __init clocksource_wdtest_init(void)
+{
+ int ret = 0;
+
+ wdtest_print_module_parms();
+
+ /* Create watchdog-test task. */
+ wdtest_task = kthread_run(wdtest_func, NULL, "wdtest");
+ if (IS_ERR(wdtest_task)) {
+ ret = PTR_ERR(wdtest_task);
+ pr_warn("%s: Failed to create wdtest kthread.\n", __func__);
+ wdtest_task = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+
+module_init(clocksource_wdtest_init);
+module_exit(clocksource_wdtest_cleanup);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2cd902592fc1f..4485635b69f5f 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -14,6 +14,8 @@
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
#include <linux/tick.h>
#include <linux/kthread.h>
+#include <linux/prandom.h>
+#include <linux/cpu.h>
#include "tick-internal.h"
#include "timekeeping_internal.h"
@@ -93,6 +95,20 @@ static char override_name[CS_NAME_LEN];
static int finished_booting;
static u64 suspend_start;
+/*
+ * Threshold: 0.0312s, when doubled: 0.0625s.
+ * Also a default for cs->uncertainty_margin when registering clocks.
+ */
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
+
+/*
+ * Maximum permissible delay between two readouts of the watchdog
+ * clocksource surrounding a read of the clocksource being validated.
+ * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
+ * a lower bound for cs->uncertainty_margin values when registering clocks.
+ */
+#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC)
+
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
static void clocksource_select(void);
@@ -119,10 +135,9 @@ static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);
/*
- * Interval: 0.5sec Threshold: 0.0625s
+ * Interval: 0.5sec.
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
static void clocksource_watchdog_work(struct work_struct *work)
{
@@ -184,12 +199,163 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
+ulong max_cswd_read_retries = 3;
+module_param(max_cswd_read_retries, ulong, 0644);
+EXPORT_SYMBOL_GPL(max_cswd_read_retries);
+static int verify_n_cpus = 8;
+module_param(verify_n_cpus, int, 0644);
+
+static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
+{
+ unsigned int nretries;
+ u64 wd_end, wd_delta;
+ int64_t wd_delay;
+
+ for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
+ local_irq_disable();
+ *wdnow = watchdog->read(watchdog);
+ *csnow = cs->read(cs);
+ wd_end = watchdog->read(watchdog);
+ local_irq_enable();
+
+ wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
+ wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
+ if (wd_delay <= WATCHDOG_MAX_SKEW) {
+ if (nretries > 1 || nretries >= max_cswd_read_retries) {
+ pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
+ smp_processor_id(), watchdog->name, nretries);
+ }
+ return true;
+ }
+ }
+
+ pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
+ smp_processor_id(), watchdog->name, wd_delay, nretries);
+ return false;
+}
+
+static u64 csnow_mid;
+static cpumask_t cpus_ahead;
+static cpumask_t cpus_behind;
+static cpumask_t cpus_chosen;
+
+static void clocksource_verify_choose_cpus(void)
+{
+ int cpu, i, n = verify_n_cpus;
+
+ if (n < 0) {
+ /* Check all of the CPUs. */
+ cpumask_copy(&cpus_chosen, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+ return;
+ }
+
+ /* If no checking desired, or no other CPU to check, leave. */
+ cpumask_clear(&cpus_chosen);
+ if (n == 0 || num_online_cpus() <= 1)
+ return;
+
+ /* Make sure to select at least one CPU other than the current CPU. */
+ cpu = cpumask_next(-1, cpu_online_mask);
+ if (cpu == smp_processor_id())
+ cpu = cpumask_next(cpu, cpu_online_mask);
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return;
+ cpumask_set_cpu(cpu, &cpus_chosen);
+
+ /* Force a sane value for the boot parameter. */
+ if (n > nr_cpu_ids)
+ n = nr_cpu_ids;
+
+ /*
+ * Randomly select the specified number of CPUs. If the same
+ * CPU is selected multiple times, that CPU is checked only once,
+ * and no replacement CPU is selected. This gracefully handles
+ * situations where verify_n_cpus is greater than the number of
+ * CPUs that are currently online.
+ */
+ for (i = 1; i < n; i++) {
+ cpu = prandom_u32() % nr_cpu_ids;
+ cpu = cpumask_next(cpu - 1, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_next(-1, cpu_online_mask);
+ if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ cpumask_set_cpu(cpu, &cpus_chosen);
+ }
+
+ /* Don't verify ourselves. */
+ cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+}
+
+static void clocksource_verify_one_cpu(void *csin)
+{
+ struct clocksource *cs = (struct clocksource *)csin;
+
+ csnow_mid = cs->read(cs);
+}
+
+void clocksource_verify_percpu(struct clocksource *cs)
+{
+ int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
+ u64 csnow_begin, csnow_end;
+ int cpu, testcpu;
+ s64 delta;
+
+ if (verify_n_cpus == 0)
+ return;
+ cpumask_clear(&cpus_ahead);
+ cpumask_clear(&cpus_behind);
+ get_online_cpus();
+ preempt_disable();
+ clocksource_verify_choose_cpus();
+ if (cpumask_weight(&cpus_chosen) == 0) {
+ preempt_enable();
+ put_online_cpus();
+ pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+ return;
+ }
+ testcpu = smp_processor_id();
+ pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ for_each_cpu(cpu, &cpus_chosen) {
+ if (cpu == testcpu)
+ continue;
+ csnow_begin = cs->read(cs);
+ smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
+ csnow_end = cs->read(cs);
+ delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
+ if (delta < 0)
+ cpumask_set_cpu(cpu, &cpus_behind);
+ delta = (csnow_end - csnow_mid) & cs->mask;
+ if (delta < 0)
+ cpumask_set_cpu(cpu, &cpus_ahead);
+ delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
+ cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ if (cs_nsec > cs_nsec_max)
+ cs_nsec_max = cs_nsec;
+ if (cs_nsec < cs_nsec_min)
+ cs_nsec_min = cs_nsec;
+ }
+ preempt_enable();
+ put_online_cpus();
+ if (!cpumask_empty(&cpus_ahead))
+ pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
+ cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
+ if (!cpumask_empty(&cpus_behind))
+ pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
+ cpumask_pr_args(&cpus_behind), testcpu, cs->name);
+ if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
+ pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
+ testcpu, cs_nsec_min, cs_nsec_max, cs->name);
+}
+EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+
static void clocksource_watchdog(struct timer_list *unused)
{
- struct clocksource *cs;
u64 csnow, wdnow, cslast, wdlast, delta;
- int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
+ int64_t wd_nsec, cs_nsec;
+ struct clocksource *cs;
+ u32 md;
spin_lock(&watchdog_lock);
if (!watchdog_running)
@@ -206,10 +372,11 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- local_irq_disable();
- csnow = cs->read(cs);
- wdnow = watchdog->read(watchdog);
- local_irq_enable();
+ if (!cs_watchdog_read(cs, &csnow, &wdnow)) {
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
+ continue;
+ }
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
@@ -235,13 +402,20 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
/* Check the deviation from the watchdog clocksource. */
- if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+ md = cs->uncertainty_margin + watchdog->uncertainty_margin;
+ if (abs(cs_nsec - wd_nsec) > md) {
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
- pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
- watchdog->name, wdnow, wdlast, watchdog->mask);
- pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
- cs->name, csnow, cslast, cs->mask);
+ pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
+ watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
+ pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
+ cs->name, cs_nsec, csnow, cslast, cs->mask);
+ if (curr_clocksource == cs)
+ pr_warn(" '%s' is current clocksource.\n", cs->name);
+ else if (curr_clocksource)
+ pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
+ else
+ pr_warn(" No current clocksource.\n");
__clocksource_unstable(cs);
continue;
}
@@ -407,6 +581,12 @@ static int __clocksource_watchdog_kthread(void)
unsigned long flags;
int select = 0;
+ /* Do any required per-CPU skew verification. */
+ if (curr_clocksource &&
+ curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
+ curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
+ clocksource_verify_percpu(curr_clocksource);
+
spin_lock_irqsave(&watchdog_lock, flags);
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
@@ -876,6 +1056,26 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
NSEC_PER_SEC / scale, sec * scale);
}
+
+ /*
+ * If the uncertainty margin is not specified, calculate it.
+ * If both scale and freq are non-zero, calculate the clock
+ * period, but bound below at 2*WATCHDOG_MAX_SKEW. However,
+ * if either of scale or freq is zero, be very conservative and
+ * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
+ * uncertainty margin. Allow stupidly small uncertainty margins
+ * to be specified by the caller for testing purposes, but warn
+ * to discourage production use of this capability.
+ */
+ if (scale && freq && !cs->uncertainty_margin) {
+ cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
+ if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
+ cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
+ } else if (!cs->uncertainty_margin) {
+ cs->uncertainty_margin = WATCHDOG_THRESHOLD;
+ }
+ WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
+
/*
* Ensure clocksources that have large 'mult' values don't overflow
* when adjusted.
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a492e4da69ba2..01935aafdb460 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -49,13 +49,14 @@ static u64 jiffies_read(struct clocksource *cs)
* for "tick-less" systems.
*/
static struct clocksource clocksource_jiffies = {
- .name = "jiffies",
- .rating = 1, /* lowest valid rating*/
- .read = jiffies_read,
- .mask = CLOCKSOURCE_MASK(32),
- .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
- .shift = JIFFIES_SHIFT,
- .max_cycles = 10,
+ .name = "jiffies",
+ .rating = 1, /* lowest valid rating*/
+ .uncertainty_margin = 32 * NSEC_PER_MSEC,
+ .read = jiffies_read,
+ .mask = CLOCKSOURCE_MASK(32),
+ .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
+ .shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
};
__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d111adf4a0cb4..84332f01dc571 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1237,20 +1237,6 @@ int try_to_del_timer_sync(struct timer_list *timer)
}
EXPORT_SYMBOL(try_to_del_timer_sync);
-bool timer_curr_running(struct timer_list *timer)
-{
- int i;
-
- for (i = 0; i < NR_BASES; i++) {
- struct timer_base *base = this_cpu_ptr(&timer_bases[i]);
-
- if (base->running_timer == timer)
- return true;
- }
-
- return false;
-}
-
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 313dca042e173..a6b68ee531605 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2573,6 +2573,18 @@ config TEST_FPU
If unsure, say N.
+config TEST_CLOCKSOURCE_WATCHDOG
+ tristate "Test clocksource watchdog in kernel space"
+ depends on CLOCKSOURCE_WATCHDOG
+ help
+ Enable this option to create a kernel module that will trigger
+ a test of the clocksource watchdog. This module may be loaded
+ via modprobe or insmod in which case it will run upon being
+ loaded, or it may be built in, in which case it will run
+ shortly after boot.
+
+ If unsure, say N.
+
endif # RUNTIME_TESTING_MENU
config ARCH_USE_MEMTEST
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 74ceb02f45e30..6e29b2aae6ba3 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -581,6 +581,14 @@ static const char *bitmap_parse_region(const char *str, struct region *r)
{
unsigned int lastbit = r->nbits - 1;
+ if (!strncasecmp(str, "all", 3)) {
+ r->start = 0;
+ r->end = lastbit;
+ str += 3;
+
+ goto check_pattern;
+ }
+
str = bitmap_getnum(str, &r->start, lastbit);
if (IS_ERR(str))
return str;
@@ -595,6 +603,7 @@ static const char *bitmap_parse_region(const char *str, struct region *r)
if (IS_ERR(str))
return str;
+check_pattern:
if (end_of_region(*str))
goto no_pattern;
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 9cd5755831809..4ea73f5aed41f 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -366,6 +366,13 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = {
{0, "0-31:1/3,1-31:1/3,2-31:1/3", &exp1[8 * step], 32, 0},
{0, "1-10:8/12,8-31:24/29,0-31:0/3", &exp1[9 * step], 32, 0},
+ {0, "all", &exp1[8 * step], 32, 0},
+ {0, "0, 1, all, ", &exp1[8 * step], 32, 0},
+ {0, "all:1/2", &exp1[4 * step], 32, 0},
+ {0, "ALL:1/2", &exp1[4 * step], 32, 0},
+ {-EINVAL, "al", NULL, 8, 0},
+ {-EINVAL, "alll", NULL, 8, 0},
+
{-EINVAL, "-1", NULL, 8, 0},
{-EINVAL, "-0", NULL, 8, 0},
{-EINVAL, "10-1", NULL, 8, 0},
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eefd3f5fde464..54527de9cd2de 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -922,7 +922,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
continue;
}
/*
- * No kthead_use_mm() user needs to read from the userspace so
+ * No kthread_use_mm() user needs to read from the userspace so
* we are ok to reap it.
*/
if (unlikely(p->flags & PF_KTHREAD))
diff --git a/mm/slab.h b/mm/slab.h
index 18c1927cd196c..7189daa0c586c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -640,6 +640,7 @@ struct kmem_obj_info {
struct kmem_cache *kp_slab_cache;
void *kp_ret;
void *kp_stack[KS_ADDRS_COUNT];
+ void *kp_free_stack[KS_ADDRS_COUNT];
};
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a4a571428c511..cc7b8ee20b95e 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -574,7 +574,7 @@ EXPORT_SYMBOL_GPL(kmem_valid_obj);
* depends on the type of object and on how much debugging is enabled.
* For a slab-cache object, the fact that it is a slab object is printed,
* and, if available, the slab name, return address, and stack trace from
- * the allocation of that object.
+ * the allocation and last free path of that object.
*
* This function will splat if passed a pointer to a non-slab object.
* If you are not sure what type of object you have, you should instead
@@ -619,6 +619,16 @@ void kmem_dump_obj(void *object)
break;
pr_info(" %pS\n", kp.kp_stack[i]);
}
+
+ if (kp.kp_free_stack[0])
+ pr_cont(" Free path:\n");
+
+ for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
+ if (!kp.kp_free_stack[i])
+ break;
+ pr_info(" %pS\n", kp.kp_free_stack[i]);
+ }
+
}
EXPORT_SYMBOL_GPL(kmem_dump_obj);
#endif
diff --git a/mm/slub.c b/mm/slub.c
index 3f96e099817a1..addcececa7555 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3994,6 +3994,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
!(s->flags & SLAB_STORE_USER))
return;
#ifdef CONFIG_SLUB_DEBUG
+ objp = fixup_red_left(s, objp);
trackp = get_track(s, objp, TRACK_ALLOC);
kpp->kp_ret = (void *)trackp->addr;
#ifdef CONFIG_STACKTRACE
@@ -4002,6 +4003,13 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
if (!kpp->kp_stack[i])
break;
}
+
+ trackp = get_track(s, objp, TRACK_FREE);
+ for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
+ kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
+ if (!kpp->kp_free_stack[i])
+ break;
+ }
#endif
#endif
}
diff --git a/mm/util.c b/mm/util.c
index a8bf17f18a810..0b6dd9d81da79 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -983,7 +983,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
* depends on the type of object and on how much debugging is enabled.
* For example, for a slab-cache object, the slab name is printed, and,
* if available, the return address and stack trace from the allocation
- * of that object.
+ * and last free path of that object.
*/
void mem_dump_obj(void *object)
{
diff --git a/tools/memory-model/Documentation/access-marking.txt b/tools/memory-model/Documentation/access-marking.txt
index 1ab189f51f55d..a3dcc32e27b44 100644
--- a/tools/memory-model/Documentation/access-marking.txt
+++ b/tools/memory-model/Documentation/access-marking.txt
@@ -37,7 +37,9 @@ compiler's use of code-motion and common-subexpression optimizations.
Therefore, if a given access is involved in an intentional data race,
using READ_ONCE() for loads and WRITE_ONCE() for stores is usually
preferable to data_race(), which in turn is usually preferable to plain
-C-language accesses.
+C-language accesses. It is permissible to combine #2 and #3, for example,
+data_race(READ_ONCE(a)), which will both restrict compiler optimizations
+and disable KCSAN diagnostics.
KCSAN will complain about many types of data races involving plain
C-language accesses, but marking all accesses involved in a given data
@@ -86,6 +88,10 @@ that fail to exclude the updates. In this case, it is important to use
data_race() for the diagnostic reads because otherwise KCSAN would give
false-positive warnings about these diagnostic reads.
+If it is necessary to both restrict compiler optimizations and disable
+KCSAN diagnostics, use both data_race() and READ_ONCE(), for example,
+data_race(READ_ONCE(a)).
+
In theory, plain C-language loads can also be used for this use case.
However, in practice this will have the disadvantage of causing KCSAN
to generate false positives because KCSAN will have no way of knowing
@@ -126,6 +132,11 @@ consistent errors, which in turn are quite capable of breaking heuristics.
Therefore use of data_race() should be limited to cases where some other
code (such as a barrier() call) will force the occasional reload.
+Note that this use case requires that the heuristic be able to handle
+any possible error. In contrast, if the heuristics might be fatally
+confused by one or more of the possible erroneous values, use READ_ONCE()
+instead of data_race().
+
In theory, plain C-language loads can also be used for this use case.
However, in practice this will have the disadvantage of causing KCSAN
to generate false positives because KCSAN will have no way of knowing
@@ -259,9 +270,9 @@ diagnostic purposes. The code might look as follows:
return ret;
}
- int read_foo_diagnostic(void)
+ void read_foo_diagnostic(void)
{
- return data_race(foo);
+ pr_info("Current value of foo: %d\n", data_race(foo));
}
The reader-writer lock prevents the compiler from introducing concurrency
@@ -274,19 +285,34 @@ tells KCSAN that data races are expected, and should be silently
ignored. This data_race() also tells the human reading the code that
read_foo_diagnostic() might sometimes return a bogus value.
-However, please note that your kernel must be built with
-CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n in order for KCSAN to
-detect a buggy lockless write. If you need KCSAN to detect such a
-write even if that write did not change the value of foo, you also
-need CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n. If you need KCSAN to
-detect such a write happening in an interrupt handler running on the
-same CPU doing the legitimate lock-protected write, you also need
-CONFIG_KCSAN_INTERRUPT_WATCHER=y. With some or all of these Kconfig
-options set properly, KCSAN can be quite helpful, although it is not
-necessarily a full replacement for hardware watchpoints. On the other
-hand, neither are hardware watchpoints a full replacement for KCSAN
-because it is not always easy to tell hardware watchpoint to conditionally
-trap on accesses.
+If it is necessary to suppress compiler optimization and also detect
+buggy lockless writes, read_foo_diagnostic() can be updated as follows:
+
+ void read_foo_diagnostic(void)
+ {
+ pr_info("Current value of foo: %d\n", data_race(READ_ONCE(foo)));
+ }
+
+Alternatively, given that KCSAN is to ignore all accesses in this function,
+this function can be marked __no_kcsan and the data_race() can be dropped:
+
+ void __no_kcsan read_foo_diagnostic(void)
+ {
+ pr_info("Current value of foo: %d\n", READ_ONCE(foo));
+ }
+
+However, in order for KCSAN to detect buggy lockless writes, your kernel
+must be built with CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n. If you
+need KCSAN to detect such a write even if that write did not change
+the value of foo, you also need CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n.
+If you need KCSAN to detect such a write happening in an interrupt handler
+running on the same CPU doing the legitimate lock-protected write, you
+also need CONFIG_KCSAN_INTERRUPT_WATCHER=y. With some or all of these
+Kconfig options set properly, KCSAN can be quite helpful, although
+it is not necessarily a full replacement for hardware watchpoints.
+On the other hand, neither are hardware watchpoints a full replacement
+for KCSAN because it is not always easy to tell hardware watchpoint to
+conditionally trap on accesses.
Lock-Protected Writes With Lockless Reads
@@ -319,6 +345,100 @@ of the ASSERT_EXCLUSIVE_WRITER() is to allow KCSAN to check for a buggy
concurrent lockless write.
+Lock-Protected Writes With Heuristic Lockless Reads
+---------------------------------------------------
+
+For another example, suppose that the code can normally make use of
+a per-data-structure lock, but there are times when a global lock
+is required. These times are indicated via a global flag. The code
+might look as follows, and is based loosely on nf_conntrack_lock(),
+nf_conntrack_all_lock(), and nf_conntrack_all_unlock():
+
+ bool global_flag;
+ DEFINE_SPINLOCK(global_lock);
+ struct foo {
+ spinlock_t f_lock;
+ int f_data;
+ };
+
+ /* All foo structures are in the following array. */
+ int nfoo;
+ struct foo *foo_array;
+
+ void do_something_locked(struct foo *fp)
+ {
+ bool gf = true;
+
+ /* IMPORTANT: Heuristic plus spin_lock()! */
+ if (!data_race(global_flag)) {
+ spin_lock(&fp->f_lock);
+ if (!smp_load_acquire(&global_flag)) {
+ do_something(fp);
+ spin_unlock(&fp->f_lock);
+ return;
+ }
+ spin_unlock(&fp->f_lock);
+ }
+ spin_lock(&global_lock);
+ /* Lock held, thus global flag cannot change. */
+ if (!global_flag) {
+ spin_lock(&fp->f_lock);
+ spin_unlock(&global_lock);
+ gf = false;
+ }
+ do_something(fp);
+ if (fg)
+ spin_unlock(&global_lock);
+ else
+ spin_lock(&fp->f_lock);
+ }
+
+ void begin_global(void)
+ {
+ int i;
+
+ spin_lock(&global_lock);
+ WRITE_ONCE(global_flag, true);
+ for (i = 0; i < nfoo; i++) {
+ /* Wait for pre-existing local locks. */
+ spin_lock(&fp->f_lock);
+ spin_unlock(&fp->f_lock);
+ }
+ }
+
+ void end_global(void)
+ {
+ smp_store_release(&global_flag, false);
+ /* Pre-existing global lock acquisitions will recheck. */
+ spin_unlock(&global_lock);
+ }
+
+All code paths leading from the do_something_locked() function's first
+read from global_flag acquire a lock, so endless load fusing cannot
+happen.
+
+If the value read from global_flag is true, then global_flag is rechecked
+while holding global_lock, which prevents global_flag from changing.
+If this recheck finds that global_flag is now false, the acquisition
+of ->f_lock prior to the release of global_lock will result in any subsequent
+begin_global() invocation waiting to acquire ->f_lock.
+
+On the other hand, if the value read from global_flag is false, then
+global_flag, then rechecking under ->f_lock combined with synchronization
+with begin_global() guarantees than any erroneous read will cause the
+do_something_locked() function's first do_something() invocation to happen
+before begin_global() returns. The combination of the smp_load_acquire()
+in do_something_locked() and the smp_store_release() in end_global()
+guarantees that either the do_something_locked() function's first
+do_something() invocation happens after the call to end_global() or that
+do_something_locked() acquires global_lock() and rechecks under the lock.
+
+For this to work, only those foo structures in foo_array[] may be
+passed to do_something_locked(). The reason for this is that the
+synchronization with begin_global() relies on momentarily locking each
+and every foo structure.
+
+
Lockless Reads and Writes
-------------------------
diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt
index f9d610d5a1a4c..5d72f3112e565 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -2510,7 +2510,7 @@ they behave as follows:
smp_mb__after_atomic() orders po-earlier atomic updates and
the events preceding them against all po-later events;
- smp_mb_after_spinlock() orders po-earlier lock acquisition
+ smp_mb__after_spinlock() orders po-earlier lock acquisition
events and the events preceding them against all po-later
events.
diff --git a/tools/memory-model/Documentation/locking.txt b/tools/memory-model/Documentation/locking.txt
new file mode 100644
index 0000000000000..4e05c6d53ab72
--- /dev/null
+++ b/tools/memory-model/Documentation/locking.txt
@@ -0,0 +1,320 @@
+Locking
+=======
+
+Locking is well-known and the common use cases are straightforward: Any
+CPU holding a given lock sees any changes previously seen or made by any
+CPU before it previously released that same lock. This last sentence
+is the only part of this document that most developers will need to read.
+
+However, developers who would like to also access lock-protected shared
+variables outside of their corresponding locks should continue reading.
+
+
+Locking and Prior Accesses
+--------------------------
+
+The basic rule of locking is worth repeating:
+
+ Any CPU holding a given lock sees any changes previously seen
+ or made by any CPU before it previously released that same lock.
+
+Note that this statement is a bit stronger than "Any CPU holding a
+given lock sees all changes made by any CPU during the time that CPU was
+previously holding this same lock". For example, consider the following
+pair of code fragments:
+
+ /* See MP+polocks.litmus. */
+ void CPU0(void)
+ {
+ WRITE_ONCE(x, 1);
+ spin_lock(&mylock);
+ WRITE_ONCE(y, 1);
+ spin_unlock(&mylock);
+ }
+
+ void CPU1(void)
+ {
+ spin_lock(&mylock);
+ r0 = READ_ONCE(y);
+ spin_unlock(&mylock);
+ r1 = READ_ONCE(x);
+ }
+
+The basic rule guarantees that if CPU0() acquires mylock before CPU1(),
+then both r0 and r1 must be set to the value 1. This also has the
+consequence that if the final value of r0 is equal to 1, then the final
+value of r1 must also be equal to 1. In contrast, the weaker rule would
+say nothing about the final value of r1.
+
+
+Locking and Subsequent Accesses
+-------------------------------
+
+The converse to the basic rule also holds: Any CPU holding a given
+lock will not see any changes that will be made by any CPU after it
+subsequently acquires this same lock. This converse statement is
+illustrated by the following litmus test:
+
+ /* See MP+porevlocks.litmus. */
+ void CPU0(void)
+ {
+ r0 = READ_ONCE(y);
+ spin_lock(&mylock);
+ r1 = READ_ONCE(x);
+ spin_unlock(&mylock);
+ }
+
+ void CPU1(void)
+ {
+ spin_lock(&mylock);
+ WRITE_ONCE(x, 1);
+ spin_unlock(&mylock);
+ WRITE_ONCE(y, 1);
+ }
+
+This converse to the basic rule guarantees that if CPU0() acquires
+mylock before CPU1(), then both r0 and r1 must be set to the value 0.
+This also has the consequence that if the final value of r1 is equal
+to 0, then the final value of r0 must also be equal to 0. In contrast,
+the weaker rule would say nothing about the final value of r0.
+
+These examples show only a single pair of CPUs, but the effects of the
+locking basic rule extend across multiple acquisitions of a given lock
+across multiple CPUs.
+
+
+Double-Checked Locking
+----------------------
+
+It is well known that more than just a lock is required to make
+double-checked locking work correctly, This litmus test illustrates
+one incorrect approach:
+
+ /* See Documentation/litmus-tests/locking/DCL-broken.litmus. */
+ P0(int *flag, int *data, int *lck)
+ {
+ int r0;
+ int r1;
+ int r2;
+
+ r0 = READ_ONCE(*flag);
+ if (r0 == 0) {
+ spin_lock(lck);
+ r1 = READ_ONCE(*flag);
+ if (r1 == 0) {
+ WRITE_ONCE(*data, 1);
+ WRITE_ONCE(*flag, 1);
+ }
+ spin_unlock(lck);
+ }
+ r2 = READ_ONCE(*data);
+ }
+ /* P1() is the exactly the same as P0(). */
+
+There are two problems. First, there is no ordering between the first
+READ_ONCE() of "flag" and the READ_ONCE() of "data". Second, there is
+no ordering between the two WRITE_ONCE() calls. It should therefore be
+no surprise that "r2" can be zero, and a quick herd7 run confirms this.
+
+One way to fix this is to use smp_load_acquire() and smp_store_release()
+as shown in this corrected version:
+
+ /* See Documentation/litmus-tests/locking/DCL-fixed.litmus. */
+ P0(int *flag, int *data, int *lck)
+ {
+ int r0;
+ int r1;
+ int r2;
+
+ r0 = smp_load_acquire(flag);
+ if (r0 == 0) {
+ spin_lock(lck);
+ r1 = READ_ONCE(*flag);
+ if (r1 == 0) {
+ WRITE_ONCE(*data, 1);
+ smp_store_release(flag, 1);
+ }
+ spin_unlock(lck);
+ }
+ r2 = READ_ONCE(*data);
+ }
+ /* P1() is the exactly the same as P0(). */
+
+The smp_load_acquire() guarantees that its load from "flags" will
+be ordered before the READ_ONCE() from data, thus solving the first
+problem. The smp_store_release() guarantees that its store will be
+ordered after the WRITE_ONCE() to "data", solving the second problem.
+The smp_store_release() pairs with the smp_load_acquire(), thus ensuring
+that the ordering provided by each actually takes effect. Again, a
+quick herd7 run confirms this.
+
+In short, if you access a lock-protected variable without holding the
+corresponding lock, you will need to provide additional ordering, in
+this case, via the smp_load_acquire() and the smp_store_release().
+
+
+Ordering Provided by a Lock to CPUs Not Holding That Lock
+---------------------------------------------------------
+
+It is not necessarily the case that accesses ordered by locking will be
+seen as ordered by CPUs not holding that lock. Consider this example:
+
+ /* See Z6.0+pooncelock+pooncelock+pombonce.litmus. */
+ void CPU0(void)
+ {
+ spin_lock(&mylock);
+ WRITE_ONCE(x, 1);
+ WRITE_ONCE(y, 1);
+ spin_unlock(&mylock);
+ }
+
+ void CPU1(void)
+ {
+ spin_lock(&mylock);
+ r0 = READ_ONCE(y);
+ WRITE_ONCE(z, 1);
+ spin_unlock(&mylock);
+ }
+
+ void CPU2(void)
+ {
+ WRITE_ONCE(z, 2);
+ smp_mb();
+ r1 = READ_ONCE(x);
+ }
+
+Counter-intuitive though it might be, it is quite possible to have
+the final value of r0 be 1, the final value of z be 2, and the final
+value of r1 be 0. The reason for this surprising outcome is that CPU2()
+never acquired the lock, and thus did not fully benefit from the lock's
+ordering properties.
+
+Ordering can be extended to CPUs not holding the lock by careful use
+of smp_mb__after_spinlock():
+
+ /* See Z6.0+pooncelock+poonceLock+pombonce.litmus. */
+ void CPU0(void)
+ {
+ spin_lock(&mylock);
+ WRITE_ONCE(x, 1);
+ WRITE_ONCE(y, 1);
+ spin_unlock(&mylock);
+ }
+
+ void CPU1(void)
+ {
+ spin_lock(&mylock);
+ smp_mb__after_spinlock();
+ r0 = READ_ONCE(y);
+ WRITE_ONCE(z, 1);
+ spin_unlock(&mylock);
+ }
+
+ void CPU2(void)
+ {
+ WRITE_ONCE(z, 2);
+ smp_mb();
+ r1 = READ_ONCE(x);
+ }
+
+This addition of smp_mb__after_spinlock() strengthens the lock
+acquisition sufficiently to rule out the counter-intuitive outcome.
+In other words, the addition of the smp_mb__after_spinlock() prohibits
+the counter-intuitive result where the final value of r0 is 1, the final
+value of z is 2, and the final value of r1 is 0.
+
+
+No Roach-Motel Locking!
+-----------------------
+
+This example requires familiarity with the herd7 "filter" clause, so
+please read up on that topic in litmus-tests.txt.
+
+It is tempting to allow memory-reference instructions to be pulled
+into a critical section, but this cannot be allowed in the general case.
+For example, consider a spin loop preceding a lock-based critical section.
+Now, herd7 does not model spin loops, but we can emulate one with two
+loads, with a "filter" clause to constrain the first to return the
+initial value and the second to return the updated value, as shown below:
+
+ /* See Documentation/litmus-tests/locking/RM-fixed.litmus. */
+ P0(int *x, int *y, int *lck)
+ {
+ int r2;
+
+ spin_lock(lck);
+ r2 = atomic_inc_return(y);
+ WRITE_ONCE(*x, 1);
+ spin_unlock(lck);
+ }
+
+ P1(int *x, int *y, int *lck)
+ {
+ int r0;
+ int r1;
+ int r2;
+
+ r0 = READ_ONCE(*x);
+ r1 = READ_ONCE(*x);
+ spin_lock(lck);
+ r2 = atomic_inc_return(y);
+ spin_unlock(lck);
+ }
+
+ filter (y=2 /\ 1:r0=0 /\ 1:r1=1)
+ exists (1:r2=1)
+
+The variable "x" is the control variable for the emulated spin loop.
+P0() sets it to "1" while holding the lock, and P1() emulates the
+spin loop by reading it twice, first into "1:r0" (which should get the
+initial value "0") and then into "1:r1" (which should get the updated
+value "1").
+
+The purpose of the variable "y" is to reject deadlocked executions.
+Only those executions where the final value of "y" have avoided deadlock.
+
+The "filter" clause takes all this into account, constraining "y" to
+equal "2", "1:r0" to equal "0", and "1:r1" to equal 1.
+
+Then the "exists" clause checks to see if P1() acquired its lock first,
+which should not happen given the filter clause because P0() updates
+"x" while holding the lock. And herd7 confirms this.
+
+But suppose that the compiler was permitted to reorder the spin loop
+into P1()'s critical section, like this:
+
+ /* See Documentation/litmus-tests/locking/RM-broken.litmus. */
+ P0(int *x, int *y, int *lck)
+ {
+ int r2;
+
+ spin_lock(lck);
+ r2 = atomic_inc_return(y);
+ WRITE_ONCE(*x, 1);
+ spin_unlock(lck);
+ }
+
+ P1(int *x, int *y, int *lck)
+ {
+ int r0;
+ int r1;
+ int r2;
+
+ spin_lock(lck);
+ r0 = READ_ONCE(*x);
+ r1 = READ_ONCE(*x);
+ r2 = atomic_inc_return(y);
+ spin_unlock(lck);
+ }
+
+ locations [x;lck;0:r2;1:r0;1:r1;1:r2]
+ filter (y=2 /\ 1:r0=0 /\ 1:r1=1)
+ exists (1:r2=1)
+
+If "1:r0" is equal to "0", "1:r1" can never equal "1" because P0()
+cannot update "x" while P1() holds the lock. And herd7 confirms this,
+showing zero executions matching the "filter" criteria.
+
+And this is why Linux-kernel lock and unlock primitives must prevent
+code from entering critical sections. It is not sufficient to only
+prevent code from leaving them.
diff --git a/tools/memory-model/litmus-tests/.gitignore b/tools/memory-model/litmus-tests/.gitignore
index c492a1ddad91d..19c379cf069d2 100644
--- a/tools/memory-model/litmus-tests/.gitignore
+++ b/tools/memory-model/litmus-tests/.gitignore
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-*.litmus.out
+*.litmus.*
diff --git a/tools/memory-model/scripts/README b/tools/memory-model/scripts/README
index 095c7eb36f9f9..cc2c4e5be9ec1 100644
--- a/tools/memory-model/scripts/README
+++ b/tools/memory-model/scripts/README
@@ -27,6 +27,14 @@ checklitmushist.sh
checklitmus.sh
Check a single litmus test against its "Result:" expected result.
+ Not intended to for manual use.
+
+checktheselitmus.sh
+
+ Check the specified list of litmus tests against their "Result:"
+ expected results. This takes optional parseargs.sh arguments,
+ followed by "--" followed by pathnames starting from the current
+ directory.
cmplitmushist.sh
@@ -43,10 +51,10 @@ initlitmushist.sh
judgelitmus.sh
- Given a .litmus file and its .litmus.out herd7 output, check the
- .litmus.out file against the .litmus file's "Result:" comment to
- judge whether the test ran correctly. Not normally run manually,
- provided instead for use by other scripts.
+ Given a .litmus file and its herd7 output, check the output file
+ against the .litmus file's "Result:" comment to judge whether
+ the test ran correctly. Not normally run manually, provided
+ instead for use by other scripts.
newlitmushist.sh
diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh
index 3c0c7fbbd223b..2d3ee850a8399 100755
--- a/tools/memory-model/scripts/checkalllitmus.sh
+++ b/tools/memory-model/scripts/checkalllitmus.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
# SPDX-License-Identifier: GPL-2.0+
#
# Run herd7 tests on all .litmus files in the litmus-tests directory
@@ -8,6 +8,11 @@
# "^^^". It also outputs verification results to a file whose name is
# that of the specified litmus test, but with ".out" appended.
#
+# If the --hw argument is specified, this script translates the .litmus
+# C-language file to the specified type of assembly and verifies that.
+# But in this case, litmus tests using complex synchronization (such as
+# locking, RCU, and SRCU) are cheerfully ignored.
+#
# Usage:
# checkalllitmus.sh
#
@@ -17,7 +22,7 @@
#
# Copyright IBM Corporation, 2018
#
-# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
. scripts/parseargs.sh
@@ -30,29 +35,23 @@ else
exit 255
fi
-# Create any new directories that have appeared in the github litmus
-# repo since the last run.
+# Create any new directories that have appeared in the litmus-tests
+# directory since the last run.
if test "$LKMM_DESTDIR" != "."
then
find $litmusdir -type d -print |
( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh )
fi
-# Find the checklitmus script. If it is not where we expect it, then
-# assume that the caller has the PATH environment variable set
-# appropriately.
-if test -x scripts/checklitmus.sh
-then
- clscript=scripts/checklitmus.sh
-else
- clscript=checklitmus.sh
-fi
-
# Run the script on all the litmus tests in the specified directory
ret=0
for i in $litmusdir/*.litmus
do
- if ! $clscript $i
+ if test -n "$LKMM_HW_MAP_FILE" && ! scripts/simpletest.sh $i
+ then
+ continue
+ fi
+ if ! scripts/checklitmus.sh $i
then
ret=1
fi
diff --git a/tools/memory-model/scripts/checkghlitmus.sh b/tools/memory-model/scripts/checkghlitmus.sh
index 6589fbb6f6538..cedd0290b73f8 100755
--- a/tools/memory-model/scripts/checkghlitmus.sh
+++ b/tools/memory-model/scripts/checkghlitmus.sh
@@ -10,6 +10,7 @@
# parseargs.sh scripts for arguments.
. scripts/parseargs.sh
+. scripts/hwfnseg.sh
T=/tmp/checkghlitmus.sh.$$
trap 'rm -rf $T' 0
@@ -32,19 +33,19 @@ then
( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh )
fi
-# Create a list of the C-language litmus tests previously run.
-( cd $LKMM_DESTDIR; find litmus -name '*.litmus.out' -print ) |
- sed -e 's/\.out$//' |
+# Create a list of the specified litmus tests previously run.
+( cd $LKMM_DESTDIR; find litmus -name "*.litmus${hwfnseg}.out" -print ) |
+ sed -e "s/${hwfnseg}"'\.out$//' |
xargs -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' |
xargs -r grep -L "^P${LKMM_PROCS}"> $T/list-C-already
# Create a list of C-language litmus tests with "Result:" commands and
# no more than the specified number of processes.
-find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C
+find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C
xargs < $T/list-C -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' > $T/list-C-result
xargs < $T/list-C-result -r grep -L "^P${LKMM_PROCS}" > $T/list-C-result-short
-# Form list of tests without corresponding .litmus.out files
+# Form list of tests without corresponding .out files
sort $T/list-C-already $T/list-C-result-short | uniq -u > $T/list-C-needed
# Run any needed tests.
diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh
index 11461ed40b5e4..4c1d0cf0ddadc 100755
--- a/tools/memory-model/scripts/checklitmus.sh
+++ b/tools/memory-model/scripts/checklitmus.sh
@@ -1,10 +1,8 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0+
#
-# Run a herd7 test and invokes judgelitmus.sh to check the result against
-# a "Result:" comment within the litmus test. It also outputs verification
-# results to a file whose name is that of the specified litmus test, but
-# with ".out" appended.
+# Invokes runlitmus.sh and judgelitmus.sh on its arguments to run the
+# specified litmus test and pass judgment on the results.
#
# Usage:
# checklitmus.sh file.litmus
@@ -15,20 +13,7 @@
#
# Copyright IBM Corporation, 2018
#
-# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
-litmus=$1
-herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg}
-
-if test -f "$litmus" -a -r "$litmus"
-then
- :
-else
- echo ' --- ' error: \"$litmus\" is not a readable file
- exit 255
-fi
-
-echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out
-/usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1
-
-scripts/judgelitmus.sh $litmus
+scripts/runlitmus.sh $1
+scripts/judgelitmus.sh $1
diff --git a/tools/memory-model/scripts/checklitmushist.sh b/tools/memory-model/scripts/checklitmushist.sh
index 1d210ffb7c8af..406ecfc0aee4c 100755
--- a/tools/memory-model/scripts/checklitmushist.sh
+++ b/tools/memory-model/scripts/checklitmushist.sh
@@ -12,7 +12,7 @@
#
# Copyright IBM Corporation, 2018
#
-# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
. scripts/parseargs.sh
diff --git a/tools/memory-model/scripts/checktheselitmus.sh b/tools/memory-model/scripts/checktheselitmus.sh
new file mode 100755
index 0000000000000..10eeb5ecea6de
--- /dev/null
+++ b/tools/memory-model/scripts/checktheselitmus.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Invokes checklitmus.sh on its arguments to run the specified litmus
+# test and pass judgment on the results.
+#
+# Usage:
+# checktheselitmus.sh -- [ file1.litmus [ file2.litmus ... ] ]
+#
+# Run this in the directory containing the memory model, specifying the
+# pathname of the litmus test to check. The usual parseargs.sh arguments
+# can be specified prior to the "--".
+#
+# This script is intended for use with pathnames that start from the
+# tools/memory-model directory. If some of the pathnames instead start at
+# the root directory, they all must do so and the "--destdir /" parseargs.sh
+# argument must be specified prior to the "--". Alternatively, some other
+# "--destdir" argument can be supplied as long as the needed subdirectories
+# are populated.
+#
+# Copyright IBM Corporation, 2018
+#
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
+
+. scripts/parseargs.sh
+
+ret=0
+for i in "$@"
+do
+ if scripts/checklitmus.sh $i
+ then
+ :
+ else
+ ret=1
+ fi
+done
+if test "$ret" -ne 0
+then
+ echo " ^^^ VERIFICATION MISMATCHES" 1>&2
+else
+ echo All litmus tests verified as was expected. 1>&2
+fi
+exit $ret
diff --git a/tools/memory-model/scripts/cmplitmushist.sh b/tools/memory-model/scripts/cmplitmushist.sh
index 0f498aeeccf5e..ca1ac8b646144 100755
--- a/tools/memory-model/scripts/cmplitmushist.sh
+++ b/tools/memory-model/scripts/cmplitmushist.sh
@@ -12,12 +12,49 @@ trap 'rm -rf $T' 0
mkdir $T
# comparetest oldpath newpath
+badmacnam=0
+timedout=0
perfect=0
obsline=0
noobsline=0
obsresult=0
badcompare=0
comparetest () {
+ if grep -q ': Unknown macro ' $1 || grep -q ': Unknown macro ' $2
+ then
+ if grep -q ': Unknown macro ' $1
+ then
+ badname=`grep ': Unknown macro ' $1 |
+ sed -e 's/^.*: Unknown macro //' |
+ sed -e 's/ (User error).*$//'`
+ echo 'Current LKMM version does not know "'$badname'"' $1
+ fi
+ if grep -q ': Unknown macro ' $2
+ then
+ badname=`grep ': Unknown macro ' $2 |
+ sed -e 's/^.*: Unknown macro //' |
+ sed -e 's/ (User error).*$//'`
+ echo 'Current LKMM version does not know "'$badname'"' $2
+ fi
+ badmacnam=`expr "$badmacnam" + 1`
+ return 0
+ elif grep -q '^Command exited with non-zero status 124' $1 ||
+ grep -q '^Command exited with non-zero status 124' $2
+ then
+ if grep -q '^Command exited with non-zero status 124' $1 &&
+ grep -q '^Command exited with non-zero status 124' $2
+ then
+ echo Both runs timed out: $2
+ elif grep -q '^Command exited with non-zero status 124' $1
+ then
+ echo Old run timed out: $2
+ elif grep -q '^Command exited with non-zero status 124' $2
+ then
+ echo New run timed out: $2
+ fi
+ timedout=`expr "$timedout" + 1`
+ return 0
+ fi
grep -v 'maxresident)k\|minor)pagefaults\|^Time' $1 > $T/oldout
grep -v 'maxresident)k\|minor)pagefaults\|^Time' $2 > $T/newout
if cmp -s $T/oldout $T/newout && grep -q '^Observation' $1
@@ -38,7 +75,7 @@ comparetest () {
return 0
fi
else
- echo Missing Observation line "(e.g., herd7 timeout)": $2
+ echo Missing Observation line "(e.g., syntax error)": $2
noobsline=`expr "$noobsline" + 1`
return 0
fi
@@ -72,12 +109,20 @@ then
fi
if test "$noobsline" -ne 0
then
- echo Missing Observation line "(e.g., herd7 timeout)": $noobsline 1>&2
+ echo Missing Observation line "(e.g., syntax error)": $noobsline 1>&2
fi
if test "$obsresult" -ne 0
then
echo Matching Observation Always/Sometimes/Never result: $obsresult 1>&2
fi
+if test "$timedout" -ne 0
+then
+ echo "!!!" Timed out: $timedout 1>&2
+fi
+if test "$badmacnam" -ne 0
+then
+ echo "!!!" Unknown primitive: $badmacnam 1>&2
+fi
if test "$badcompare" -ne 0
then
echo "!!!" Result changed: $badcompare 1>&2
diff --git a/tools/memory-model/scripts/hwfnseg.sh b/tools/memory-model/scripts/hwfnseg.sh
new file mode 100755
index 0000000000000..580c3281181c5
--- /dev/null
+++ b/tools/memory-model/scripts/hwfnseg.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Generate the hardware extension to the litmus-test filename, or the
+# empty string if this is an LKMM run. The extension is placed in
+# the shell variable hwfnseg.
+#
+# Usage:
+# . hwfnseg.sh
+#
+# Copyright IBM Corporation, 2019
+#
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
+
+if test -z "$LKMM_HW_MAP_FILE"
+then
+ hwfnseg=
+else
+ hwfnseg=".$LKMM_HW_MAP_FILE"
+fi
diff --git a/tools/memory-model/scripts/initlitmushist.sh b/tools/memory-model/scripts/initlitmushist.sh
index 956b6957484d8..31ea782955d3f 100755
--- a/tools/memory-model/scripts/initlitmushist.sh
+++ b/tools/memory-model/scripts/initlitmushist.sh
@@ -60,7 +60,7 @@ fi
# Create a list of the C-language litmus tests with no more than the
# specified number of processes (per the --procs argument).
-find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C
+find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C
xargs < $T/list-C -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short
scripts/runlitmushist.sh < $T/list-C-short
diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh
index 0cc63875e395d..1ec5d89fcfbb2 100755
--- a/tools/memory-model/scripts/judgelitmus.sh
+++ b/tools/memory-model/scripts/judgelitmus.sh
@@ -1,9 +1,22 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0+
#
-# Given a .litmus test and the corresponding .litmus.out file, check
-# the .litmus.out file against the "Result:" comment to judge whether
-# the test ran correctly.
+# Given a .litmus test and the corresponding litmus output file, check
+# the .litmus.out file against the "Result:" comment to judge whether the
+# test ran correctly. If the --hw argument is omitted, check against the
+# LKMM output, which is assumed to be in file.litmus.out. If either a
+# "DATARACE" marker in the "Result:" comment or a "Flag data-race" marker
+# in the LKMM output is present, the other must also be as well, at least
+# for litmus tests having a "Result:" comment. In this case, a failure of
+# the Always/Sometimes/Never portion of the "Result:" prediction will be
+# noted, but forgiven.
+#
+# If the --hw argument is provided, this is assumed to be a hardware
+# test, and the output is assumed to be in file.litmus.HW.out, where
+# "HW" is the --hw argument. In addition, non-Sometimes verification
+# results will be noted, but forgiven. Furthermore, if there is no
+# "Result:" comment but there is an LKMM .litmus.out file, the observation
+# in that file will be used to judge the assembly-language verification.
#
# Usage:
# judgelitmus.sh file.litmus
@@ -13,7 +26,7 @@
#
# Copyright IBM Corporation, 2018
#
-# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
litmus=$1
@@ -24,55 +37,120 @@ else
echo ' --- ' error: \"$litmus\" is not a readable file
exit 255
fi
-if test -f "$LKMM_DESTDIR/$litmus".out -a -r "$LKMM_DESTDIR/$litmus".out
+if test -z "$LKMM_HW_MAP_FILE"
+then
+ litmusout=$litmus.out
+ lkmmout=
+else
+ litmusout="`echo $litmus |
+ sed -e 's/\.litmus$/.litmus.'${LKMM_HW_MAP_FILE}'/'`.out"
+ lkmmout=$litmus.out
+fi
+if test -f "$LKMM_DESTDIR/$litmusout" -a -r "$LKMM_DESTDIR/$litmusout"
then
:
else
- echo ' --- ' error: \"$LKMM_DESTDIR/$litmus\".out is not a readable file
+ echo ' --- ' error: \"$LKMM_DESTDIR/$litmusout is not a readable file
exit 255
fi
-if grep -q '^ \* Result: ' $litmus
+if grep -q '^Flag data-race$' "$LKMM_DESTDIR/$litmusout"
+then
+ datarace_modeled=1
+fi
+if grep -q '^[( ]\* Result: ' $litmus
+then
+ outcome=`grep -m 1 '^[( ]\* Result: ' $litmus | awk '{ print $3 }'`
+ if grep -m1 '^[( ]\* Result: .* DATARACE' $litmus
+ then
+ datarace_predicted=1
+ fi
+ if test -n "$datarace_predicted" -a -z "$datarace_modeled" -a -z "$LKMM_HW_MAP_FILE"
+ then
+ echo '!!! Predicted data race not modeled' $litmus
+ exit 252
+ elif test -z "$datarace_predicted" -a -n "$datarace_modeled"
+ then
+ # Note that hardware models currently don't model data races
+ echo '!!! Unexpected data race modeled' $litmus
+ exit 253
+ fi
+elif test -n "$LKMM_HW_MAP_FILE" && grep -q '^Observation' $LKMM_DESTDIR/$lkmmout > /dev/null 2>&1
then
- outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'`
+ outcome=`grep -m 1 '^Observation ' $LKMM_DESTDIR/$lkmmout | awk '{ print $3 }'`
else
outcome=specified
fi
-grep '^Observation' $LKMM_DESTDIR/$litmus.out
-if grep -q '^Observation' $LKMM_DESTDIR/$litmus.out
+grep '^Observation' $LKMM_DESTDIR/$litmusout
+if grep -q '^Observation' $LKMM_DESTDIR/$litmusout
then
:
+elif grep ': Unknown macro ' $LKMM_DESTDIR/$litmusout
+then
+ badname=`grep ': Unknown macro ' $LKMM_DESTDIR/$litmusout |
+ sed -e 's/^.*: Unknown macro //' |
+ sed -e 's/ (User error).*$//'`
+ badmsg=' !!! Current LKMM version does not know "'$badname'"'" $litmus"
+ echo $badmsg
+ if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout
+ then
+ echo ' !!! '$badmsg >> $LKMM_DESTDIR/$litmusout 2>&1
+ fi
+ exit 254
+elif grep '^Command exited with non-zero status 124' $LKMM_DESTDIR/$litmusout
+then
+ echo ' !!! Timeout' $litmus
+ if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout
+ then
+ echo ' !!! Timeout' >> $LKMM_DESTDIR/$litmusout 2>&1
+ fi
+ exit 124
else
echo ' !!! Verification error' $litmus
- if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out
+ if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout
then
- echo ' !!! Verification error' >> $LKMM_DESTDIR/$litmus.out 2>&1
+ echo ' !!! Verification error' >> $LKMM_DESTDIR/$litmusout 2>&1
fi
exit 255
fi
if test "$outcome" = DEADLOCK
then
- if grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q 'Never 0 0$'
+ if grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q 'Never 0 0$'
then
ret=0
else
echo " !!! Unexpected non-$outcome verification" $litmus
- if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out
+ if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout
then
- echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1
+ echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmusout 2>&1
fi
ret=1
fi
-elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q $outcome || test "$outcome" = Maybe
+elif grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q 'Never 0 0$'
+then
+ echo " !!! Unexpected non-$outcome deadlock" $litmus
+ if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout
+ then
+ echo " !!! Unexpected non-$outcome deadlock" $litmus >> $LKMM_DESTDIR/$litmusout 2>&1
+ fi
+ ret=1
+elif grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q $outcome || test "$outcome" = Maybe
then
ret=0
else
- echo " !!! Unexpected non-$outcome verification" $litmus
- if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out
+ if test \( -n "$LKMM_HW_MAP_FILE" -a "$outcome" = Sometimes \) -o -n "$datarace_modeled"
then
- echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1
+ flag="--- Forgiven"
+ ret=0
+ else
+ flag="!!! Unexpected"
+ ret=1
+ fi
+ echo " $flag non-$outcome verification" $litmus
+ if ! grep -qe "$flag" $LKMM_DESTDIR/$litmusout
+ then
+ echo " $flag non-$outcome verification" >> $LKMM_DESTDIR/$litmusout 2>&1
fi
- ret=1
fi
-tail -2 $LKMM_DESTDIR/$litmus.out | head -1
+tail -2 $LKMM_DESTDIR/$litmusout | head -1
exit $ret
diff --git a/tools/memory-model/scripts/newlitmushist.sh b/tools/memory-model/scripts/newlitmushist.sh
index 991f8f8148817..25235e2049cf0 100755
--- a/tools/memory-model/scripts/newlitmushist.sh
+++ b/tools/memory-model/scripts/newlitmushist.sh
@@ -12,7 +12,7 @@
#
# Copyright IBM Corporation, 2018
#
-# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
. scripts/parseargs.sh
@@ -43,7 +43,7 @@ fi
# Form full list of litmus tests with no more than the specified
# number of processes (per the --procs argument).
-find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C-all
+find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C-all
xargs < $T/list-C-all -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short
# Form list of new tests. Note: This does not handle litmus-test deletion!
diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh
index 40f52080fdbd6..08ded59098607 100755
--- a/tools/memory-model/scripts/parseargs.sh
+++ b/tools/memory-model/scripts/parseargs.sh
@@ -1,7 +1,7 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0+
#
-# the corresponding .litmus.out file, and does not judge the result.
+# Parse arguments common to the various scripts.
#
# . scripts/parseargs.sh
#
@@ -9,7 +9,7 @@
#
# Copyright IBM Corporation, 2018
#
-# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
T=/tmp/parseargs.sh.$$
mkdir $T
@@ -27,6 +27,7 @@ initparam () {
initparam LKMM_DESTDIR "."
initparam LKMM_HERD_OPTIONS "-conf linux-kernel.cfg"
+initparam LKMM_HW_MAP_FILE ""
initparam LKMM_JOBS `getconf _NPROCESSORS_ONLN`
initparam LKMM_PROCS "3"
initparam LKMM_TIMEOUT "1m"
@@ -37,10 +38,11 @@ usagehelp () {
echo "Usage $scriptname [ arguments ]"
echo " --destdir path (place for .litmus.out, default by .litmus)"
echo " --herdopts -conf linux-kernel.cfg ..."
+ echo " --hw AArch64"
echo " --jobs N (number of jobs, default one per CPU)"
echo " --procs N (litmus tests with at most this many processes)"
echo " --timeout N (herd7 timeout (e.g., 10s, 1m, 2hr, 1d, '')"
- echo "Defaults: --destdir '$LKMM_DESTDIR_DEF' --herdopts '$LKMM_HERD_OPTIONS_DEF' --jobs '$LKMM_JOBS_DEF' --procs '$LKMM_PROCS_DEF' --timeout '$LKMM_TIMEOUT_DEF'"
+ echo "Defaults: --destdir '$LKMM_DESTDIR_DEF' --herdopts '$LKMM_HERD_OPTIONS_DEF' --hw '$LKMM_HW_MAP_FILE' --jobs '$LKMM_JOBS_DEF' --procs '$LKMM_PROCS_DEF' --timeout '$LKMM_TIMEOUT_DEF'"
exit 1
}
@@ -81,7 +83,7 @@ do
echo "Cannot create directory --destdir '$LKMM_DESTDIR'"
usage
fi
- if test -d "$LKMM_DESTDIR" -a -w "$LKMM_DESTDIR" -a -x "$LKMM_DESTDIR"
+ if test -d "$LKMM_DESTDIR" -a -x "$LKMM_DESTDIR"
then
:
else
@@ -95,6 +97,11 @@ do
LKMM_HERD_OPTIONS="$2"
shift
;;
+ --hw)
+ checkarg --hw "(.map file architecture name)" "$#" "$2" '^[A-Za-z0-9_-]\+' '^--'
+ LKMM_HW_MAP_FILE="$2"
+ shift
+ ;;
-j[1-9]*)
njobs="`echo $1 | sed -e 's/^-j//'`"
trailchars="`echo $njobs | sed -e 's/[0-9]\+\(.*\)$/\1/'`"
@@ -106,7 +113,7 @@ do
LKMM_JOBS="`echo $njobs | sed -e 's/^\([0-9]\+\).*$/\1/'`"
;;
--jobs|--job|-j)
- checkarg --jobs "(number)" "$#" "$2" '^[1-9][0-9]\+$' '^--'
+ checkarg --jobs "(number)" "$#" "$2" '^[1-9][0-9]*$' '^--'
LKMM_JOBS="$2"
shift
;;
@@ -120,6 +127,10 @@ do
LKMM_TIMEOUT="$2"
shift
;;
+ --)
+ shift
+ break
+ ;;
*)
echo Unknown argument $1
usage
diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh
new file mode 100755
index 0000000000000..94608d4b6502e
--- /dev/null
+++ b/tools/memory-model/scripts/runlitmus.sh
@@ -0,0 +1,80 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Without the -hw argument, runs a herd7 test and outputs verification
+# results to a file whose name is that of the specified litmus test,
+# but with ".out" appended.
+#
+# If the --hw argument is specified, this script translates the .litmus
+# C-language file to the specified type of assembly and verifies that.
+# But in this case, litmus tests using complex synchronization (such as
+# locking, RCU, and SRCU) are cheerfully ignored.
+#
+# Either way, return the status of the herd7 command.
+#
+# Usage:
+# runlitmus.sh file.litmus
+#
+# Run this in the directory containing the memory model, specifying the
+# pathname of the litmus test to check. The caller is expected to have
+# properly set up the LKMM environment variables.
+#
+# Copyright IBM Corporation, 2019
+#
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
+
+litmus=$1
+if test -f "$litmus" -a -r "$litmus"
+then
+ :
+else
+ echo ' !!! ' error: \"$litmus\" is not a readable file
+ exit 255
+fi
+
+if test -z "$LKMM_HW_MAP_FILE" -o ! -e $LKMM_DESTDIR/$litmus.out
+then
+ # LKMM run
+ herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg}
+ echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out
+ /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1
+ ret=$?
+ if test -z "$LKMM_HW_MAP_FILE"
+ then
+ exit $ret
+ fi
+ echo " --- " Automatically generated LKMM output for '"'--hw $LKMM_HW_MAP_FILE'"' run
+fi
+
+# Hardware run
+
+T=/tmp/checklitmushw.sh.$$
+trap 'rm -rf $T' 0 2
+mkdir $T
+
+# Generate filenames
+mapfile="Linux2${LKMM_HW_MAP_FILE}.map"
+themefile="$T/${LKMM_HW_MAP_FILE}.theme"
+herdoptions="-model $LKMM_HW_CAT_FILE"
+hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.litmus.'${LKMM_HW_MAP_FILE}'/'`
+hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'`
+
+# Don't run on litmus tests with complex synchronization
+if ! scripts/simpletest.sh $litmus
+then
+ echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU
+ exit 254
+fi
+
+# Generate the assembly code and run herd7 on it.
+gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile
+jingle7 -v -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out
+if grep -q "Generated 0 tests" $T/$hwlitmusfile.jingle7.out
+then
+ echo ' !!! ' jingle7 failed, errors in $hwlitmus.err
+ cp $T/$hwlitmusfile.jingle7.out $LKMM_DESTDIR/$hwlitmus.err
+ exit 253
+fi
+/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -unroll 0 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1
+
+exit $?
diff --git a/tools/memory-model/scripts/runlitmushist.sh b/tools/memory-model/scripts/runlitmushist.sh
index 6ed376f495bb4..c6c2bdc67a502 100755
--- a/tools/memory-model/scripts/runlitmushist.sh
+++ b/tools/memory-model/scripts/runlitmushist.sh
@@ -13,7 +13,9 @@
#
# Copyright IBM Corporation, 2018
#
-# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
+
+. scripts/hwfnseg.sh
T=/tmp/runlitmushist.sh.$$
trap 'rm -rf $T' 0
@@ -30,15 +32,12 @@ fi
# Prefixes for per-CPU scripts
for ((i=0;i<$LKMM_JOBS;i++))
do
- echo dir="$LKMM_DESTDIR" > $T/$i.sh
echo T=$T >> $T/$i.sh
- echo herdoptions=\"$LKMM_HERD_OPTIONS\" >> $T/$i.sh
cat << '___EOF___' >> $T/$i.sh
runtest () {
- echo ' ... ' /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 '>' $dir/$1.out '2>&1'
- if /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 > $dir/$1.out 2>&1
+ if scripts/runlitmus.sh $1
then
- if ! grep -q '^Observation ' $dir/$1.out
+ if ! grep -q '^Observation ' $LKMM_DESTDIR/$1$2.out
then
echo ' !!! Herd failed, no Observation:' $1
fi
@@ -47,10 +46,16 @@ do
if test "$exitcode" -eq 124
then
exitmsg="timed out"
+ elif test "$exitcode" -eq 253
+ then
+ exitmsg=
else
exitmsg="failed, exit code $exitcode"
fi
- echo ' !!! Herd' ${exitmsg}: $1
+ if test -n "$exitmsg"
+ then
+ echo ' !!! Herd' ${exitmsg}: $1
+ fi
fi
}
___EOF___
@@ -59,11 +64,13 @@ done
awk -v q="'" -v b='\\' '
{
print "echo `grep " q "^P[0-9]" b "+(" q " " $0 " | tail -1 | sed -e " q "s/^P" b "([0-9]" b "+" b ")(.*$/" b "1/" q "` " $0
-}' | bash |
-sort -k1n |
-awk -v ncpu=$LKMM_JOBS -v t=$T '
+}' | sh | sort -k1n |
+awk -v dq='"' -v hwfnseg="$hwfnseg" -v ncpu="$LKMM_JOBS" -v t="$T" '
{
- print "runtest " $2 >> t "/" NR % ncpu ".sh";
+ print "if test -z " dq hwfnseg dq " || scripts/simpletest.sh " dq $2 dq
+ print "then"
+ print "\techo runtest " dq $2 dq " " hwfnseg " >> " t "/" NR % ncpu ".sh";
+ print "fi"
}
END {
diff --git a/tools/memory-model/scripts/simpletest.sh b/tools/memory-model/scripts/simpletest.sh
new file mode 100755
index 0000000000000..7edc5d3616657
--- /dev/null
+++ b/tools/memory-model/scripts/simpletest.sh
@@ -0,0 +1,35 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Give zero status if this is a simple test and non-zero otherwise.
+# Simple tests do not contain locking, RCU, or SRCU.
+#
+# Usage:
+# simpletest.sh file.litmus
+#
+# Copyright IBM Corporation, 2019
+#
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
+
+
+litmus=$1
+
+if test -f "$litmus" -a -r "$litmus"
+then
+ :
+else
+ echo ' --- ' error: \"$litmus\" is not a readable file
+ exit 255
+fi
+exclude="^[[:space:]]*\("
+exclude="${exclude}spin_lock(\|spin_unlock(\|spin_trylock(\|spin_is_locked("
+exclude="${exclude}\|rcu_read_lock(\|rcu_read_unlock("
+exclude="${exclude}\|synchronize_rcu(\|synchronize_rcu_expedited("
+exclude="${exclude}\|srcu_read_lock(\|srcu_read_unlock("
+exclude="${exclude}\|synchronize_srcu(\|synchronize_srcu_expedited("
+exclude="${exclude}\)"
+if grep -q $exclude $litmus
+then
+ exit 255
+fi
+exit 0
diff --git a/tools/rcu/rcu-cbs.py b/tools/rcu/rcu-cbs.py
new file mode 100644
index 0000000000000..f8b461b9eaa71
--- /dev/null
+++ b/tools/rcu/rcu-cbs.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env drgn
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Dump out the number of RCU callbacks outstanding.
+#
+# On older kernels having multiple flavors of RCU, this dumps out the
+# number of callbacks for the most heavily used flavor.
+#
+# Usage: sudo drgn rcu-cbs.py
+#
+# Copyright (C) 2021 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+import sys
+import drgn
+from drgn import NULL, Object
+from drgn.helpers.linux import *
+
+def get_rdp0(prog):
+ try:
+ rdp0 = prog.variable('rcu_preempt_data', 'kernel/rcu/tree.c');
+ except LookupError:
+ rdp0 = NULL;
+
+ if rdp0 == NULL:
+ try:
+ rdp0 = prog.variable('rcu_sched_data',
+ 'kernel/rcu/tree.c');
+ except LookupError:
+ rdp0 = NULL;
+
+ if rdp0 == NULL:
+ rdp0 = prog.variable('rcu_data', 'kernel/rcu/tree.c');
+ return rdp0.address_of_();
+
+rdp0 = get_rdp0(prog);
+
+# Sum up RCU callbacks.
+sum = 0;
+for cpu in for_each_possible_cpu(prog):
+ rdp = per_cpu_ptr(rdp0, cpu);
+ len = rdp.cblist.len.value_();
+ # print("CPU " + str(cpu) + " RCU callbacks: " + str(len));
+ sum += len;
+print("Number of RCU callbacks in flight: " + str(sum));
diff --git a/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh
index e5cc6b2f195eb..1af5d6b86b392 100755
--- a/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh
+++ b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh
@@ -14,7 +14,7 @@ if test -z "$TORTURE_KCONFIG_KCSAN_ARG"
then
exit 0
fi
-cat $1/*/console.log |
+find $1 -name console.log -exec cat {} \; |
grep "BUG: KCSAN: " |
sed -e 's/^\[[^]]*] //' |
sort |
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh
index 46e47a00a7db4..d8c8483c46f13 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh
@@ -29,7 +29,7 @@ then
echo "Usage: $scriptname /path/to/old/run [ options ]"
exit 1
fi
-if ! cp "$oldrun/batches" $T/batches.oldrun
+if ! cp "$oldrun/scenarios" $T/scenarios.oldrun
then
# Later on, can reconstitute this from console.log files.
echo Prior run batches file does not exist: $oldrun/batches
@@ -143,6 +143,8 @@ then
usage
fi
rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log
+touch "$rundir/log"
+echo $scriptname $args | tee -a "$rundir/log"
echo $oldrun > "$rundir/re-run"
if ! test -d "$rundir/../../bin"
then
@@ -165,22 +167,12 @@ done
grep '^#' $i | sed -e 's/^# //' > $T/qemu-cmd-settings
. $T/qemu-cmd-settings
-grep -v '^#' $T/batches.oldrun | awk '
-BEGIN {
- oldbatch = 1;
-}
-
+grep -v '^#' $T/scenarios.oldrun | awk '
{
- if (oldbatch != $1) {
- print "kvm-test-1-run-batch.sh" curbatch;
- curbatch = "";
- oldbatch = $1;
- }
- curbatch = curbatch " " $2;
-}
-
-END {
- print "kvm-test-1-run-batch.sh" curbatch
+ curbatch = "";
+ for (i = 2; i <= NF; i++)
+ curbatch = curbatch " " $i;
+ print "kvm-test-1-run-batch.sh" curbatch;
}' > $T/runbatches.sh
if test -n "$dryrun"
@@ -188,12 +180,5 @@ then
echo ---- Dryrun complete, directory: $rundir | tee -a "$rundir/log"
else
( cd "$rundir"; sh $T/runbatches.sh )
- kcsan-collapse.sh "$rundir" | tee -a "$rundir/log"
- echo | tee -a "$rundir/log"
- echo ---- Results directory: $rundir | tee -a "$rundir/log"
- kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1
- ret=$?
- cat $T/kvm-recheck.sh.out | tee -a "$rundir/log"
- echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log"
- exit $ret
+ kvm-end-run-stats.sh "$rundir" "$starttime"
fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
index 115e1822b26f2..5ad973dca8207 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
@@ -40,8 +40,10 @@ if test $retval -gt 1
then
exit 2
fi
-ncpus=`cpus2use.sh`
-make -j$ncpus $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1
+
+# Tell "make" to use double the number of real CPUs on the build system.
+ncpus="`getconf _NPROCESSORS_ONLN`"
+make -j$((2 * ncpus)) $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1
retval=$?
if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | egrep -q "Stop|Error|error:|warning:" || egrep -q "Stop|Error|error:" < $resdir/Make.out
then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh b/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh
new file mode 100755
index 0000000000000..e4a00779b8c69
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-end-run-stats.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Check the status of the specified run.
+#
+# Usage: kvm-end-run-stats.sh /path/to/run starttime
+#
+# Copyright (C) 2021 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+# scriptname=$0
+# args="$*"
+rundir="$1"
+if ! test -d "$rundir"
+then
+ echo kvm-end-run-stats.sh: Specified run directory does not exist: $rundir
+ exit 1
+fi
+
+T=${TMPDIR-/tmp}/kvm-end-run-stats.sh.$$
+trap 'rm -rf $T' 0
+mkdir $T
+
+KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
+PATH=${KVM}/bin:$PATH; export PATH
+. functions.sh
+default_starttime="`get_starttime`"
+starttime="${2-default_starttime}"
+
+echo | tee -a "$rundir/log"
+echo | tee -a "$rundir/log"
+echo " --- `date` Test summary:" | tee -a "$rundir/log"
+echo Results directory: $rundir | tee -a "$rundir/log"
+kcsan-collapse.sh "$rundir" | tee -a "$rundir/log"
+kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1
+ret=$?
+cat $T/kvm-recheck.sh.out | tee -a "$rundir/log"
+echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log"
+exit $ret
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
index 0670841122d8a..daf64b5070387 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
@@ -43,7 +43,7 @@ then
else
echo No build errors.
fi
-if grep -q -e "--buildonly" < ${rundir}/log
+if grep -q -e "--build-\?only" < ${rundir}/log && ! test -f "${rundir}/remote-log"
then
echo Build-only run, no console logs to check.
exit $editorret
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
index 1706cd4466b42..fbdf162b6acdf 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
@@ -31,7 +31,7 @@ then
echo "$configfile ------- " $stopstate
else
title="$configfile ------- $ngps GPs"
- dur=`sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null`
+ dur=`grep -v '^#' $i/qemu-cmd | sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//'`
if test -z "$dur"
then
:
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
new file mode 100755
index 0000000000000..79e680e0e7bf3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Run a series of tests on remote systems under KVM.
+#
+# Usage: kvm-remote.sh "systems" [ <kvm.sh args> ]
+# kvm-remote.sh "systems" /path/to/old/run [ <kvm-again.sh args> ]
+#
+# Copyright (C) 2021 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+scriptname=$0
+args="$*"
+
+if ! test -d tools/testing/selftests/rcutorture/bin
+then
+ echo $scriptname must be run from top-level directory of kernel source tree.
+ exit 1
+fi
+
+KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
+PATH=${KVM}/bin:$PATH; export PATH
+. functions.sh
+
+starttime="`get_starttime`"
+
+systems="$1"
+if test -z "$systems"
+then
+ echo $scriptname: Empty list of systems will go nowhere good, giving up.
+ exit 1
+fi
+shift
+
+# Pathnames:
+# T: /tmp/kvm-remote.sh.$$
+# resdir: /tmp/kvm-remote.sh.$$/res
+# rundir: /tmp/kvm-remote.sh.$$/res/$ds ("-remote" suffix)
+# oldrun: `pwd`/tools/testing/.../res/$otherds
+#
+# Pathname segments:
+# TD: kvm-remote.sh.$$
+# ds: yyyy.mm.dd-hh.mm.ss-remote
+
+TD=kvm-remote.sh.$$
+T=${TMPDIR-/tmp}/$TD
+trap 'rm -rf $T' 0
+mkdir $T
+
+resdir="$T/res"
+ds=`date +%Y.%m.%d-%H.%M.%S`-remote
+rundir=$resdir/$ds
+echo Results directory: $rundir
+echo $scriptname $args
+if echo $1 | grep -q '^--'
+then
+ # Fresh build. Create a datestamp unless the caller supplied one.
+ datestamp="`echo "$@" | awk -v ds="$ds" '{
+ for (i = 1; i < NF; i++) {
+ if ($i == "--datestamp") {
+ ds = "";
+ break;
+ }
+ }
+ if (ds != "")
+ print "--datestamp " ds;
+ }'`"
+ kvm.sh --remote "$@" $datestamp --buildonly > $T/kvm.sh.out 2>&1
+ ret=$?
+ if test "$ret" -ne 0
+ then
+ echo $scriptname: kvm.sh failed exit code $?
+ cat $T/kvm.sh.out
+ exit 2
+ fi
+ oldrun="`grep -m 1 "^Results directory: " $T/kvm.sh.out | awk '{ print $3 }'`"
+ touch "$oldrun/remote-log"
+ echo $scriptname $args >> "$oldrun/remote-log"
+ echo | tee -a "$oldrun/remote-log"
+ echo " ----" kvm.sh output: "(`date`)" | tee -a "$oldrun/remote-log"
+ cat $T/kvm.sh.out | tee -a "$oldrun/remote-log"
+ # We are going to run this, so remove the buildonly files.
+ rm -f "$oldrun"/*/buildonly
+ kvm-again.sh $oldrun --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1
+ ret=$?
+ if test "$ret" -ne 0
+ then
+ echo $scriptname: kvm-again.sh failed exit code $? | tee -a "$oldrun/remote-log"
+ cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log"
+ exit 2
+ fi
+else
+ # Re-use old run.
+ oldrun="$1"
+ if ! echo $oldrun | grep -q '^/'
+ then
+ oldrun="`pwd`/$oldrun"
+ fi
+ shift
+ touch "$oldrun/remote-log"
+ echo $scriptname $args >> "$oldrun/remote-log"
+ kvm-again.sh "$oldrun" "$@" --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1
+ ret=$?
+ if test "$ret" -ne 0
+ then
+ echo $scriptname: kvm-again.sh failed exit code $? | tee -a "$oldrun/remote-log"
+ cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log"
+ exit 2
+ fi
+ cp -a "$rundir" "$KVM/res/"
+ oldrun="$KVM/res/$ds"
+fi
+echo | tee -a "$oldrun/remote-log"
+echo " ----" kvm-again.sh output: "(`date`)" | tee -a "$oldrun/remote-log"
+cat $T/kvm-again.sh.out
+echo | tee -a "$oldrun/remote-log"
+echo Remote run directory: $rundir | tee -a "$oldrun/remote-log"
+echo Local build-side run directory: $oldrun | tee -a "$oldrun/remote-log"
+
+# Create the kvm-remote-N.sh scripts in the bin directory.
+awk < "$rundir"/scenarios -v dest="$T/bin" -v rundir="$rundir" '
+{
+ n = $1;
+ sub(/\./, "", n);
+ fn = dest "/kvm-remote-" n ".sh"
+ scenarios = "";
+ for (i = 2; i <= NF; i++)
+ scenarios = scenarios " " $i;
+ print "kvm-test-1-run-batch.sh" scenarios > fn;
+ print "rm " rundir "/remote.run" >> fn;
+}'
+chmod +x $T/bin/kvm-remote-*.sh
+( cd "`dirname $T`"; tar -chzf $T/binres.tgz "$TD/bin" "$TD/res" )
+
+# Check first to avoid the need for cleanup for system-name typos
+for i in $systems
+do
+ ncpus="`ssh $i getconf _NPROCESSORS_ONLN 2> /dev/null`"
+ echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log"
+ ret=$?
+ if test "$ret" -ne 0
+ then
+ echo System $i unreachable, giving up. | tee -a "$oldrun/remote-log"
+ exit 4 | tee -a "$oldrun/remote-log"
+ fi
+done
+
+# Download and expand the tarball on all systems.
+for i in $systems
+do
+ echo Downloading tarball to $i `date` | tee -a "$oldrun/remote-log"
+ cat $T/binres.tgz | ssh $i "cd /tmp; tar -xzf -"
+ ret=$?
+ if test "$ret" -ne 0
+ then
+ echo Unable to download $T/binres.tgz to system $i, giving up. | tee -a "$oldrun/remote-log"
+ exit 10 | tee -a "$oldrun/remote-log"
+ fi
+done
+
+# Function to check for presence of a file on the specified system.
+# Complain if the system cannot be reached, and retry after a wait.
+# Currently just waits forever if a machine disappears.
+#
+# Usage: checkremotefile system pathname
+checkremotefile () {
+ local ret
+ local sleeptime=60
+
+ while :
+ do
+ ssh $1 "test -f \"$2\""
+ ret=$?
+ if test "$ret" -ne 255
+ then
+ return $ret
+ fi
+ echo " ---" ssh failure to $1 checking for file $2, retry after $sleeptime seconds. `date`
+ sleep $sleeptime
+ done
+}
+
+# Function to start batches on idle remote $systems
+#
+# Usage: startbatches curbatch nbatches
+#
+# Batches are numbered starting at 1. Returns the next batch to start.
+# Be careful to redirect all debug output to FD 2 (stderr).
+startbatches () {
+ local curbatch="$1"
+ local nbatches="$2"
+ local ret
+
+ # Each pass through the following loop examines one system.
+ for i in $systems
+ do
+ if test "$curbatch" -gt "$nbatches"
+ then
+ echo $((nbatches + 1))
+ return 0
+ fi
+ if checkremotefile "$i" "$resdir/$ds/remote.run" 1>&2
+ then
+ continue # System still running last test, skip.
+ fi
+ ssh "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2
+ ret=$?
+ if test "$ret" -ne 0
+ then
+ echo ssh $i failed: exitcode $ret 1>&2
+ exit 11
+ fi
+ echo " ----" System $i Batch `head -n $curbatch < "$rundir"/scenarios | tail -1` `date` 1>&2
+ curbatch=$((curbatch + 1))
+ done
+ echo $curbatch
+}
+
+# Launch all the scenarios.
+nbatches="`wc -l "$rundir"/scenarios | awk '{ print $1 }'`"
+curbatch=1
+while test "$curbatch" -le "$nbatches"
+do
+ startbatches $curbatch $nbatches > $T/curbatch 2> $T/startbatches.stderr
+ curbatch="`cat $T/curbatch`"
+ if test -s "$T/startbatches.stderr"
+ then
+ cat "$T/startbatches.stderr" | tee -a "$oldrun/remote-log"
+ fi
+ if test "$curbatch" -le "$nbatches"
+ then
+ sleep 30
+ fi
+done
+echo All batches started. `date`
+
+# Wait for all remaining scenarios to complete and collect results.
+for i in $systems
+do
+ while checkremotefile "$i" "$resdir/$ds/remote.run"
+ do
+ sleep 30
+ done
+ ( cd "$oldrun"; ssh $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu_pid */qemu-retval; rm -rf $T > /dev/null 2>&1" | tar -xzf - )
+done
+
+( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log"
+exit "`cat $T/exitcode`"
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 6bf00a003d3d1..b4ac4ee332226 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -20,6 +20,9 @@ mkdir $T
cd `dirname $scriptname`/../../../../../
+# This script knows only English.
+LANG=en_US.UTF-8; export LANG
+
dur=$((30*60))
dryrun=""
KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
@@ -41,6 +44,7 @@ TORTURE_KCONFIG_KASAN_ARG=""
TORTURE_KCONFIG_KCSAN_ARG=""
TORTURE_KMAKE_ARG=""
TORTURE_QEMU_MEM=512
+TORTURE_REMOTE=
TORTURE_SHUTDOWN_GRACE=180
TORTURE_SUITE=rcu
TORTURE_MOD=rcutorture
@@ -64,7 +68,7 @@ usage () {
echo " --cpus N"
echo " --datestamp string"
echo " --defconfig string"
- echo " --dryrun batches|sched|script"
+ echo " --dryrun batches|scenarios|sched|script"
echo " --duration minutes | <seconds>s | <hours>h | <days>d"
echo " --gdb"
echo " --help"
@@ -77,6 +81,7 @@ usage () {
echo " --no-initrd"
echo " --qemu-args qemu-arguments"
echo " --qemu-cmd qemu-system-..."
+ echo " --remote"
echo " --results absolute-pathname"
echo " --torture lock|rcu|rcuscale|refscale|scf"
echo " --trust-make"
@@ -112,10 +117,13 @@ do
checkarg --cpus "(number)" "$#" "$2" '^[0-9]*$' '^--'
cpus=$2
TORTURE_ALLOTED_CPUS="$2"
- max_cpus="`identify_qemu_vcpus`"
- if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus"
+ if test -z "$TORTURE_REMOTE"
then
- TORTURE_ALLOTED_CPUS=$max_cpus
+ max_cpus="`identify_qemu_vcpus`"
+ if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus"
+ then
+ TORTURE_ALLOTED_CPUS=$max_cpus
+ fi
fi
shift
;;
@@ -130,7 +138,7 @@ do
shift
;;
--dryrun)
- checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|sched\|script' '^--'
+ checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|scenarios\|sched\|script' '^--'
dryrun=$2
shift
;;
@@ -206,6 +214,9 @@ do
TORTURE_QEMU_CMD="$2"
shift
;;
+ --remote)
+ TORTURE_REMOTE=1
+ ;;
--results)
checkarg --results "(absolute pathname)" "$#" "$2" '^/' '^error'
resdir=$2
@@ -550,20 +561,7 @@ END {
if (ncpus != 0)
dump(first, i, batchnum);
}' >> $T/script
-
-cat << '___EOF___' >> $T/script
-echo | tee -a $TORTURE_RESDIR/log
-echo | tee -a $TORTURE_RESDIR/log
-echo " --- `date` Test summary:" | tee -a $TORTURE_RESDIR/log
-___EOF___
-cat << ___EOF___ >> $T/script
-echo Results directory: $resdir/$ds | tee -a $resdir/$ds/log
-kcsan-collapse.sh $resdir/$ds | tee -a $resdir/$ds/log
-kvm-recheck.sh $resdir/$ds > $T/kvm-recheck.sh.out 2>&1
-___EOF___
-echo 'ret=$?' >> $T/script
-echo "cat $T/kvm-recheck.sh.out | tee -a $resdir/$ds/log" >> $T/script
-echo 'exit $ret' >> $T/script
+echo kvm-end-run-stats.sh "$resdir/$ds" "$starttime" >> $T/script
# Extract the tests and their batches from the script.
egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" |
@@ -577,6 +575,25 @@ egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" |
print batchno, $1, $2
}' > $T/batches
+# As above, but one line per batch.
+grep -v '^#' $T/batches | awk '
+BEGIN {
+ oldbatch = 1;
+}
+
+{
+ if (oldbatch != $1) {
+ print ++n ". " curbatch;
+ curbatch = "";
+ oldbatch = $1;
+ }
+ curbatch = curbatch " " $2;
+}
+
+END {
+ print ++n ". " curbatch;
+}' > $T/scenarios
+
if test "$dryrun" = script
then
cat $T/script
@@ -597,13 +614,17 @@ elif test "$dryrun" = batches
then
cat $T/batches
exit 0
+elif test "$dryrun" = scenarios
+then
+ cat $T/scenarios
+ exit 0
else
# Not a dryrun. Record the batches and the number of CPUs, then run the script.
bash $T/script
ret=$?
cp $T/batches $resdir/$ds/batches
+ cp $T/scenarios $resdir/$ds/scenarios
echo '#' cpus=$cpus >> $resdir/$ds/batches
- echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a $resdir/$ds/log
exit $ret
fi
diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh
index 56e2e1a425699..363f56081eff3 100755
--- a/tools/testing/selftests/rcutorture/bin/torture.sh
+++ b/tools/testing/selftests/rcutorture/bin/torture.sh
@@ -53,6 +53,7 @@ do_refscale=yes
do_kvfree=yes
do_kasan=yes
do_kcsan=no
+do_clocksourcewd=yes
# doyesno - Helper function for yes/no arguments
function doyesno () {
@@ -72,6 +73,7 @@ usage () {
echo " --configs-scftorture \"config-file list w/ repeat factor (2*CFLIST)\""
echo " --doall"
echo " --doallmodconfig / --do-no-allmodconfig"
+ echo " --do-clocksourcewd / --do-no-clocksourcewd"
echo " --do-kasan / --do-no-kasan"
echo " --do-kcsan / --do-no-kcsan"
echo " --do-kvfree / --do-no-kvfree"
@@ -109,7 +111,7 @@ do
configs_scftorture="$configs_scftorture $2"
shift
;;
- --doall)
+ --do-all|--doall)
do_allmodconfig=yes
do_rcutorture=yes
do_locktorture=yes
@@ -119,10 +121,14 @@ do
do_kvfree=yes
do_kasan=yes
do_kcsan=yes
+ do_clocksourcewd=yes
;;
--do-allmodconfig|--do-no-allmodconfig)
do_allmodconfig=`doyesno "$1" --do-allmodconfig`
;;
+ --do-clocksourcewd|--do-no-clocksourcewd)
+ do_clocksourcewd=`doyesno "$1" --do-clocksourcewd`
+ ;;
--do-kasan|--do-no-kasan)
do_kasan=`doyesno "$1" --do-kasan`
;;
@@ -135,7 +141,7 @@ do
--do-locktorture|--do-no-locktorture)
do_locktorture=`doyesno "$1" --do-locktorture`
;;
- --do-none)
+ --do-none|--donone)
do_allmodconfig=no
do_rcutorture=no
do_locktorture=no
@@ -145,6 +151,7 @@ do
do_kvfree=no
do_kasan=no
do_kcsan=no
+ do_clocksourcewd=no
;;
--do-rcuscale|--do-no-rcuscale)
do_rcuscale=`doyesno "$1" --do-rcuscale`
@@ -279,9 +286,9 @@ function torture_one {
# torture_bootargs="[ kernel boot arguments ]"
# torture_set flavor [ kvm.sh arguments ]
#
-# Note that "flavor" is an arbitrary string. Supply --torture if needed.
-# Note that quoting is problematic. So on the command line, pass multiple
-# values with multiple kvm.sh argument instances.
+# Note that "flavor" is an arbitrary string that does not affect kvm.sh
+# in any way. So also supply --torture if you need something other than
+# the default.
function torture_set {
local cur_kcsan_kmake_args=
local kcsan_kmake_tag=
@@ -302,7 +309,7 @@ function torture_set {
kcsan_kmake_tag="--kmake-args"
cur_kcsan_kmake_args="$kcsan_kmake_args"
fi
- torture_one $* --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan
+ torture_one "$@" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan
fi
}
@@ -377,6 +384,22 @@ then
torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make
fi
+if test "$do_clocksourcewd" = "yes"
+then
+ torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000"
+ torture_set "clocksourcewd-1" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make
+
+ torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 clocksource.max_cswd_read_retries=1"
+ torture_set "clocksourcewd-2" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --kconfig "CONFIG_TEST_CLOCKSOURCE_WATCHDOG=y" --trust-make
+
+ # In case our work is already done...
+ if test "$do_rcutorture" != "yes"
+ then
+ torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000"
+ torture_set "clocksourcewd-3" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 45s --configs TREE03 --trust-make
+ fi
+fi
+
echo " --- " $scriptname $args
echo " --- " Done `date` | tee -a $T/log
ret=0
@@ -395,6 +418,10 @@ then
nfailures="`wc -l "$T/failures" | awk '{ print $1 }'`"
ret=2
fi
+if test "$do_kcsan" = "yes"
+then
+ TORTURE_KCONFIG_KCSAN_ARG=1 tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh tools/testing/selftests/rcutorture/res/$ds > tools/testing/selftests/rcutorture/res/$ds/kcsan.sum
+fi
echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log
echo Summary: Successes: $nsuccesses Failures: $nfailures. | tee -a $T/log
tdir="`cat $T/successes $T/failures | head -1 | awk '{ print $NF }' | sed -e 's,/[^/]\+/*$,,'`"
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST
new file mode 100644
index 0000000000000..22d598f9cabe5
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST
@@ -0,0 +1,17 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=16
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=y
+CONFIG_NO_HZ_IDLE=n
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_TRACE=y
+CONFIG_HOTPLUG_CPU=y
+CONFIG_RCU_FANOUT=2
+CONFIG_RCU_FANOUT_LEAF=2
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot
new file mode 100644
index 0000000000000..f57720c52c0f9
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot
@@ -0,0 +1,8 @@
+rcutorture.test_boost=2
+rcutorture.stutter=0
+rcutree.gp_preinit_delay=12
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
+rcutree.kthread_prio=2
+threadirqs
+tree.use_softirq=0
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE
index 721cfda76ab23..4cc1cc5813214 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE
@@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_HOTPLUG_CPU=n
+CONFIG_HOTPLUG_CPU=y
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_NOCB_CPU=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54
index 7629f5dd73b29..f5952061fde7b 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54
@@ -8,7 +8,7 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_HOTPLUG_CPU=n
+CONFIG_HOTPLUG_CPU=y
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_FANOUT=3
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
index 1cd25b7314e3e..ad505a887bec4 100644
--- a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
+++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
@@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_HOTPLUG_CPU=n
+CONFIG_HOTPLUG_CPU=y
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_NOCB_CPU=n
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
index d10bc694f42cd..4f08e641bb6b8 100644
--- a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
+++ b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
@@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_HOTPLUG_CPU=n
+CONFIG_HOTPLUG_CPU=y
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_NOCB_CPU=n
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h
index cf6938d679d7f..1e24827f96f18 100644
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h
@@ -174,7 +174,7 @@ static inline bool spin_trylock(spinlock_t *lock)
}
struct completion {
- /* Hopefuly this won't overflow. */
+ /* Hopefully this won't overflow. */
unsigned int count;
};