aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Stephen Rothwell <sfr@canb.auug.org.au> 2021-02-15 14:36:57 +1100
committer Stephen Rothwell <sfr@canb.auug.org.au> 2021-02-15 14:36:57 +1100
commit 5e055bd14829caf08a45b8de223fbd6047539595 (patch)
tree a79a4abf7e0eb28168ce21d4eb1f27fa5db79918
parent 058d232f6e2ac8f64f7580eb46c1c98a9975d741 (diff)
parent f1b61f7b4fb971f281978fb905507e9ac9b2d973 (diff)
download linux-next-5e055bd14829caf08a45b8de223fbd6047539595.tar.gz
Merge remote-tracking branch 'tip/auto-latest'
Notice: this object is not reachable from any branch.
Notice: this object is not reachable from any branch.
-rw-r--r--Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst4
-rw-r--r--Documentation/RCU/Design/Requirements/Requirements.rst732
-rw-r--r--Documentation/RCU/checklist.rst10
-rw-r--r--Documentation/RCU/rcubarrier.rst6
-rw-r--r--Documentation/RCU/stallwarn.rst27
-rw-r--r--Documentation/RCU/whatisRCU.rst10
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt63
-rw-r--r--Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt3
-rw-r--r--Documentation/devicetree/bindings/timer/stericsson-u300-apptimer.txt18
-rw-r--r--Documentation/scheduler/schedutil.txt169
-rw-r--r--MAINTAINERS5
-rw-r--r--arch/Kconfig15
-rw-r--r--arch/ia64/include/asm/efi.h13
-rw-r--r--arch/ia64/include/asm/tlb.h4
-rw-r--r--arch/ia64/kernel/efi.c1
-rw-r--r--arch/ia64/kernel/machine_kexec.c1
-rw-r--r--arch/ia64/kernel/mca.c1
-rw-r--r--arch/ia64/kernel/smpboot.c1
-rw-r--r--arch/ia64/kernel/time.c1
-rw-r--r--arch/ia64/kernel/uncached.c4
-rw-r--r--arch/ia64/mm/contig.c1
-rw-r--r--arch/ia64/mm/discontig.c1
-rw-r--r--arch/ia64/mm/init.c1
-rw-r--r--arch/parisc/Kconfig1
-rw-r--r--arch/parisc/include/asm/hardirq.h4
-rw-r--r--arch/parisc/kernel/irq.c1
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/include/asm/irq.h2
-rw-r--r--arch/powerpc/kernel/irq.c1
-rw-r--r--arch/powerpc/perf/core-book3s.c2
-rw-r--r--arch/powerpc/platforms/cell/spufs/sched.c2
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/include/asm/hardirq.h1
-rw-r--r--arch/s390/kernel/irq.c1
-rw-r--r--arch/sh/Kconfig1
-rw-r--r--arch/sh/include/asm/irq.h1
-rw-r--r--arch/sh/kernel/irq.c1
-rw-r--r--arch/sparc/Kconfig1
-rw-r--r--arch/sparc/include/asm/irq_64.h1
-rw-r--r--arch/sparc/include/asm/tlb_64.h1
-rw-r--r--arch/sparc/kernel/irq_64.c1
-rw-r--r--arch/x86/Kconfig9
-rw-r--r--arch/x86/Makefile34
-rw-r--r--arch/x86/entry/common.c19
-rw-r--r--arch/x86/entry/entry_64.S67
-rw-r--r--arch/x86/events/core.c28
-rw-r--r--arch/x86/events/intel/core.c547
-rw-r--r--arch/x86/events/intel/ds.c131
-rw-r--r--arch/x86/events/intel/uncore.c58
-rw-r--r--arch/x86/events/intel/uncore.h5
-rw-r--r--arch/x86/events/intel/uncore_snb.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c114
-rw-r--r--arch/x86/events/perf_event.h21
-rw-r--r--arch/x86/events/probe.c7
-rw-r--r--arch/x86/events/probe.h7
-rw-r--r--arch/x86/events/rapl.c34
-rw-r--r--arch/x86/include/asm/cpufeature.h7
-rw-r--r--arch/x86/include/asm/cpufeatures.h17
-rw-r--r--arch/x86/include/asm/disabled-features.h3
-rw-r--r--arch/x86/include/asm/efi.h46
-rw-r--r--arch/x86/include/asm/fpu/api.h12
-rw-r--r--arch/x86/include/asm/idtentry.h17
-rw-r--r--arch/x86/include/asm/insn.h45
-rw-r--r--arch/x86/include/asm/irq.h4
-rw-r--r--arch/x86/include/asm/irq_stack.h279
-rw-r--r--arch/x86/include/asm/irqflags.h46
-rw-r--r--arch/x86/include/asm/kprobes.h11
-rw-r--r--arch/x86/include/asm/mce.h22
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/orc_types.h10
-rw-r--r--arch/x86/include/asm/page_64_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h30
-rw-r--r--arch/x86/include/asm/paravirt_types.h17
-rw-r--r--arch/x86/include/asm/perf_event.h24
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/include/asm/preempt.h48
-rw-r--r--arch/x86/include/asm/processor.h9
-rw-r--r--arch/x86/include/asm/required-features.h3
-rw-r--r--arch/x86/include/asm/resctrl.h11
-rw-r--r--arch/x86/include/asm/softirq_stack.h11
-rw-r--r--arch/x86/include/asm/special_insns.h6
-rw-r--r--arch/x86/include/asm/static_call.h7
-rw-r--r--arch/x86/include/asm/thermal.h13
-rw-r--r--arch/x86/include/asm/tlb.h1
-rw-r--r--arch/x86/include/asm/unwind_hints.h13
-rw-r--r--arch/x86/include/asm/vm86.h1
-rw-r--r--arch/x86/include/uapi/asm/vm86.h4
-rw-r--r--arch/x86/kernel/acpi/Makefile1
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S4
-rw-r--r--arch/x86/kernel/apic/apic.c31
-rw-r--r--arch/x86/kernel/asm-offsets_64.c3
-rw-r--r--arch/x86/kernel/cpu/common.c7
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/mce/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mce/core.c16
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h1
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c24
-rw-r--r--arch/x86/kernel/cpu/scattered.c5
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c3
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c13
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c14
-rw-r--r--arch/x86/kernel/dumpstack_64.c22
-rw-r--r--arch/x86/kernel/fpu/xstate.c4
-rw-r--r--arch/x86/kernel/ftrace_64.S8
-rw-r--r--arch/x86/kernel/irq.c23
-rw-r--r--arch/x86/kernel/irq_32.c1
-rw-r--r--arch/x86/kernel/irq_64.c12
-rw-r--r--arch/x86/kernel/irqflags.S11
-rw-r--r--arch/x86/kernel/kprobes/core.c168
-rw-r--r--arch/x86/kernel/ldt.c10
-rw-r--r--arch/x86/kernel/module.c1
-rw-r--r--arch/x86/kernel/msr.c7
-rw-r--r--arch/x86/kernel/paravirt.c7
-rw-r--r--arch/x86/kernel/paravirt_patch.c10
-rw-r--r--arch/x86/kernel/pci-iommu_table.c3
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/ptrace.c46
-rw-r--r--arch/x86/kernel/reboot.c9
-rw-r--r--arch/x86/kernel/static_call.c17
-rw-r--r--arch/x86/kernel/sys_x86_64.c8
-rw-r--r--arch/x86/kernel/unwind_orc.c5
-rw-r--r--arch/x86/kernel/vm86_32.c62
-rw-r--r--arch/x86/kvm/x86.c1
-rw-r--r--arch/x86/lib/insn.c119
-rw-r--r--arch/x86/lib/retpoline.S2
-rw-r--r--arch/x86/mm/fault.c403
-rw-r--r--arch/x86/mm/init.c19
-rw-r--r--arch/x86/mm/mem_encrypt.c5
-rw-r--r--arch/x86/mm/mmio-mod.c2
-rw-r--r--arch/x86/pci/init.c15
-rw-r--r--arch/x86/platform/Makefile1
-rw-r--r--arch/x86/platform/efi/efi_64.c33
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S6
-rw-r--r--arch/x86/platform/efi/quirks.c16
-rw-r--r--arch/x86/platform/geode/alix.c19
-rw-r--r--arch/x86/platform/geode/geos.c19
-rw-r--r--arch/x86/platform/geode/net5501.c13
-rw-r--r--arch/x86/platform/goldfish/Makefile2
-rw-r--r--arch/x86/platform/goldfish/goldfish.c54
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bt.c4
-rw-r--r--arch/x86/platform/pvh/head.S2
-rw-r--r--arch/x86/power/Makefile1
-rw-r--r--arch/x86/power/hibernate_asm_64.S103
-rw-r--r--arch/x86/tools/Makefile8
-rw-r--r--arch/x86/tools/insn_sanity.c4
-rw-r--r--arch/x86/tools/relocs.c12
-rw-r--r--arch/x86/xen/Makefile1
-rw-r--r--arch/x86/xen/enlighten_pv.c32
-rw-r--r--arch/x86/xen/irq.c23
-rw-r--r--arch/x86/xen/xen-asm.S80
-rw-r--r--arch/x86/xen/xen-head.S5
-rw-r--r--arch/x86/xen/xen-ops.h3
-rw-r--r--drivers/clocksource/Kconfig36
-rw-r--r--drivers/clocksource/Makefile5
-rw-r--r--drivers/clocksource/timer-atlas7.c281
-rw-r--r--drivers/clocksource/timer-davinci.c5
-rw-r--r--drivers/clocksource/timer-efm32.c278
-rw-r--r--drivers/clocksource/timer-microchip-pit64b.c86
-rw-r--r--drivers/clocksource/timer-prima2.c242
-rw-r--r--drivers/clocksource/timer-tango-xtal.c57
-rw-r--r--drivers/clocksource/timer-u300.c457
-rw-r--r--drivers/firmware/efi/libstub/arm64-stub.c4
-rw-r--r--drivers/firmware/efi/libstub/efistub.h11
-rw-r--r--drivers/s390/char/sclp_early_core.c4
-rw-r--r--drivers/thermal/cpufreq_cooling.c69
-rw-r--r--drivers/thermal/intel/Kconfig4
-rw-r--r--drivers/thermal/intel/Makefile1
-rw-r--r--drivers/thermal/intel/therm_throt.c (renamed from arch/x86/kernel/cpu/mce/therm_throt.c)41
-rw-r--r--drivers/thermal/intel/thermal_interrupt.h15
-rw-r--r--drivers/thermal/intel/x86_pkg_temp_thermal.c4
-rw-r--r--fs/exec.c4
-rw-r--r--fs/proc/task_mmu.c9
-rw-r--r--include/asm-generic/Kbuild1
-rw-r--r--include/asm-generic/softirq_stack.h14
-rw-r--r--include/asm-generic/tlb.h6
-rw-r--r--include/asm-generic/vmlinux.lds.h5
-rw-r--r--include/linux/cgroup.h4
-rw-r--r--include/linux/compiler.h2
-rw-r--r--include/linux/cpu.h2
-rw-r--r--include/linux/efi.h19
-rw-r--r--include/linux/entry-common.h4
-rw-r--r--include/linux/entry-kvm.h14
-rw-r--r--include/linux/interrupt.h9
-rw-r--r--include/linux/irqflags.h12
-rw-r--r--include/linux/jump_label.h12
-rw-r--r--include/linux/kernel.h23
-rw-r--r--include/linux/list.h2
-rw-r--r--include/linux/local_lock_internal.h5
-rw-r--r--include/linux/lockdep.h15
-rw-r--r--include/linux/lockdep_types.h18
-rw-r--r--include/linux/mm.h2
-rw-r--r--include/linux/mm_types.h7
-rw-r--r--include/linux/mutex.h25
-rw-r--r--include/linux/objtool.h13
-rw-r--r--include/linux/perf_event.h4
-rw-r--r--include/linux/rbtree.h206
-rw-r--r--include/linux/rcu_segcblist.h120
-rw-r--r--include/linux/rcupdate.h44
-rw-r--r--include/linux/sched.h34
-rw-r--r--include/linux/sched/prio.h18
-rw-r--r--include/linux/slab.h2
-rw-r--r--include/linux/srcu.h3
-rw-r--r--include/linux/srcutiny.h7
-rw-r--r--include/linux/static_call.h77
-rw-r--r--include/linux/static_call_types.h50
-rw-r--r--include/linux/syscalls.h8
-rw-r--r--include/linux/timer.h2
-rw-r--r--include/linux/topology.h1
-rw-r--r--include/linux/torture.h27
-rw-r--r--include/linux/vmalloc.h6
-rw-r--r--include/trace/events/rcu.h26
-rw-r--r--include/uapi/linux/perf_event.h54
-rw-r--r--init/Kconfig2
-rw-r--r--kernel/Kconfig.preempt19
-rw-r--r--kernel/cpu.c7
-rw-r--r--kernel/entry/common.c17
-rw-r--r--kernel/events/core.c206
-rw-r--r--kernel/events/uprobes.c80
-rw-r--r--kernel/futex.c13
-rw-r--r--kernel/kcsan/core.c26
-rw-r--r--kernel/locking/Makefile1
-rw-r--r--kernel/locking/irqflag-debug.c13
-rw-r--r--kernel/locking/lockdep.c186
-rw-r--r--kernel/locking/locktorture.c1
-rw-r--r--kernel/locking/mutex.c10
-rw-r--r--kernel/locking/rtmutex.c71
-rw-r--r--kernel/locking/rwsem.h0
-rw-r--r--kernel/rcu/Kconfig5
-rw-r--r--kernel/rcu/rcu.h16
-rw-r--r--kernel/rcu/rcu_segcblist.c216
-rw-r--r--kernel/rcu/rcu_segcblist.h57
-rw-r--r--kernel/rcu/rcutorture.c395
-rw-r--r--kernel/rcu/refscale.c23
-rw-r--r--kernel/rcu/srcutiny.c77
-rw-r--r--kernel/rcu/srcutree.c147
-rw-r--r--kernel/rcu/tasks.h79
-rw-r--r--kernel/rcu/tree.c154
-rw-r--r--kernel/rcu/tree.h4
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h398
-rw-r--r--kernel/rcu/tree_stall.h60
-rw-r--r--kernel/rcu/update.c4
-rw-r--r--kernel/scftorture.c6
-rw-r--r--kernel/sched/core.c366
-rw-r--r--kernel/sched/cpufreq_schedutil.c108
-rw-r--r--kernel/sched/deadline.c94
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c322
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h51
-rw-r--r--kernel/sched/topology.c99
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/static_call.c60
-rw-r--r--kernel/time/alarmtimer.c8
-rw-r--r--kernel/time/hrtimer.c32
-rw-r--r--kernel/time/namespace.c6
-rw-r--r--kernel/time/timer.c14
-rw-r--r--kernel/torture.c167
-rw-r--r--lib/Kconfig.debug8
-rw-r--r--lib/Makefile3
-rw-r--r--lib/locking-selftest.c334
-rw-r--r--lib/percpu-refcount.c12
-rw-r--r--lib/test_fpu.c6
-rw-r--r--lib/timerqueue.c28
-rw-r--r--mm/hugetlb.c18
-rw-r--r--mm/madvise.c12
-rw-r--r--mm/memory.c8
-rw-r--r--mm/mmap.c8
-rw-r--r--mm/mmu_gather.c31
-rw-r--r--mm/oom_kill.c6
-rw-r--r--mm/slab.c20
-rw-r--r--mm/slab.h12
-rw-r--r--mm/slab_common.c75
-rw-r--r--mm/slob.c6
-rw-r--r--mm/slub.c40
-rw-r--r--mm/util.c31
-rw-r--r--mm/vmalloc.c13
-rwxr-xr-xscripts/checkpatch.pl6
-rw-r--r--tools/arch/x86/include/asm/disabled-features.h3
-rw-r--r--tools/arch/x86/include/asm/insn.h45
-rw-r--r--tools/arch/x86/include/asm/orc_types.h10
-rw-r--r--tools/arch/x86/include/asm/required-features.h3
-rw-r--r--tools/arch/x86/lib/insn.c119
-rw-r--r--tools/include/linux/objtool.h13
-rw-r--r--tools/include/linux/rbtree.h192
-rw-r--r--tools/include/linux/static_call_types.h50
-rw-r--r--tools/include/nolibc/nolibc.h153
-rw-r--r--tools/memory-model/Documentation/glossary.txt12
-rw-r--r--tools/memory-model/README2
-rw-r--r--tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus4
-rw-r--r--tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus4
-rw-r--r--tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus4
-rw-r--r--tools/memory-model/litmus-tests/CoWW+poonceonce.litmus4
-rw-r--r--tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus5
-rw-r--r--tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus5
-rw-r--r--tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus7
-rw-r--r--tools/memory-model/litmus-tests/ISA2+poonceonces.litmus6
-rw-r--r--tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus6
-rw-r--r--tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus5
-rw-r--r--tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus5
-rw-r--r--tools/memory-model/litmus-tests/LB+poonceonces.litmus5
-rw-r--r--tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus5
-rw-r--r--tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus4
-rw-r--r--tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus5
-rw-r--r--tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus5
-rw-r--r--tools/memory-model/litmus-tests/MP+polocks.litmus6
-rw-r--r--tools/memory-model/litmus-tests/MP+poonceonces.litmus5
-rw-r--r--tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus5
-rw-r--r--tools/memory-model/litmus-tests/MP+porevlocks.litmus6
-rw-r--r--tools/memory-model/litmus-tests/R+fencembonceonces.litmus5
-rw-r--r--tools/memory-model/litmus-tests/R+poonceonces.litmus5
-rw-r--r--tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus5
-rw-r--r--tools/memory-model/litmus-tests/S+poonceonces.litmus5
-rw-r--r--tools/memory-model/litmus-tests/SB+fencembonceonces.litmus5
-rw-r--r--tools/memory-model/litmus-tests/SB+poonceonces.litmus5
-rw-r--r--tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus5
-rw-r--r--tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus5
-rw-r--r--tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus5
-rw-r--r--tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus7
-rw-r--r--tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus7
-rw-r--r--tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus6
-rw-r--r--tools/objtool/.gitignore2
-rw-r--r--tools/objtool/Documentation/stack-validation.txt16
-rw-r--r--tools/objtool/Makefile5
-rw-r--r--tools/objtool/arch/x86/decode.c54
-rw-r--r--tools/objtool/arch/x86/include/arch/cfi_regs.h (renamed from tools/objtool/arch/x86/include/cfi_regs.h)0
-rw-r--r--tools/objtool/arch/x86/include/arch/elf.h (renamed from tools/objtool/arch/x86/include/arch_elf.h)0
-rw-r--r--tools/objtool/arch/x86/include/arch/endianness.h9
-rw-r--r--tools/objtool/arch/x86/include/arch/special.h (renamed from tools/objtool/arch/x86/include/arch_special.h)0
-rw-r--r--tools/objtool/arch/x86/special.c6
-rw-r--r--tools/objtool/builtin-check.c4
-rw-r--r--tools/objtool/builtin-orc.c10
-rw-r--r--tools/objtool/check.c419
-rw-r--r--tools/objtool/elf.c113
-rw-r--r--tools/objtool/include/objtool/arch.h (renamed from tools/objtool/arch.h)8
-rw-r--r--tools/objtool/include/objtool/builtin.h (renamed from tools/objtool/builtin.h)0
-rw-r--r--tools/objtool/include/objtool/cfi.h (renamed from tools/objtool/cfi.h)2
-rw-r--r--tools/objtool/include/objtool/check.h (renamed from tools/objtool/check.h)38
-rw-r--r--tools/objtool/include/objtool/elf.h (renamed from tools/objtool/elf.h)0
-rw-r--r--tools/objtool/include/objtool/endianness.h38
-rw-r--r--tools/objtool/include/objtool/objtool.h (renamed from tools/objtool/objtool.h)5
-rw-r--r--tools/objtool/include/objtool/special.h (renamed from tools/objtool/special.h)4
-rw-r--r--tools/objtool/include/objtool/warn.h (renamed from tools/objtool/warn.h)2
-rw-r--r--tools/objtool/objtool.c6
-rw-r--r--tools/objtool/orc_dump.c11
-rw-r--r--tools/objtool/orc_gen.c315
-rw-r--r--tools/objtool/special.c14
-rw-r--r--tools/objtool/weak.c9
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/config2csv.sh67
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/console-badness.sh1
-rw-r--r--tools/testing/selftests/rcutorture/bin/functions.sh36
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-find-errors.sh9
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck.sh3
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh12
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm.sh103
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/mkinitrd.sh2
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/parse-build.sh2
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/parse-console.sh2
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/torture.sh442
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot4
-rw-r--r--tools/testing/selftests/x86/helpers.h24
370 files changed, 8558 insertions, 5334 deletions
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
index 72f0f6fbd53c0..6f89cf1e567d0 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
@@ -38,7 +38,7 @@ sections.
RCU-preempt Expedited Grace Periods
===================================
-``CONFIG_PREEMPT=y`` kernels implement RCU-preempt.
+``CONFIG_PREEMPTION=y`` kernels implement RCU-preempt.
The overall flow of the handling of a given CPU by an RCU-preempt
expedited grace period is shown in the following diagram:
@@ -112,7 +112,7 @@ things.
RCU-sched Expedited Grace Periods
---------------------------------
-``CONFIG_PREEMPT=n`` kernels implement RCU-sched. The overall flow of
+``CONFIG_PREEMPTION=n`` kernels implement RCU-sched. The overall flow of
the handling of a given CPU by an RCU-sched expedited grace period is
shown in the following diagram:
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index d4c9a016074b3..38a39476fc248 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -72,13 +72,13 @@ understanding of this guarantee.
RCU's grace-period guarantee allows updaters to wait for the completion
of all pre-existing RCU read-side critical sections. An RCU read-side
-critical section begins with the marker ``rcu_read_lock()`` and ends
-with the marker ``rcu_read_unlock()``. These markers may be nested, and
+critical section begins with the marker rcu_read_lock() and ends
+with the marker rcu_read_unlock(). These markers may be nested, and
RCU treats a nested set as one big RCU read-side critical section.
-Production-quality implementations of ``rcu_read_lock()`` and
-``rcu_read_unlock()`` are extremely lightweight, and in fact have
+Production-quality implementations of rcu_read_lock() and
+rcu_read_unlock() are extremely lightweight, and in fact have
exactly zero overhead in Linux kernels built for production use with
-``CONFIG_PREEMPT=n``.
+``CONFIG_PREEMPTION=n``.
This guarantee allows ordering to be enforced with extremely low
overhead to readers, for example:
@@ -102,12 +102,12 @@ overhead to readers, for example:
15 WRITE_ONCE(y, 1);
16 }
-Because the ``synchronize_rcu()`` on line 14 waits for all pre-existing
-readers, any instance of ``thread0()`` that loads a value of zero from
-``x`` must complete before ``thread1()`` stores to ``y``, so that
+Because the synchronize_rcu() on line 14 waits for all pre-existing
+readers, any instance of thread0() that loads a value of zero from
+``x`` must complete before thread1() stores to ``y``, so that
instance must also load a value of zero from ``y``. Similarly, any
-instance of ``thread0()`` that loads a value of one from ``y`` must have
-started after the ``synchronize_rcu()`` started, and must therefore also
+instance of thread0() that loads a value of one from ``y`` must have
+started after the synchronize_rcu() started, and must therefore also
load a value of one from ``x``. Therefore, the outcome:
::
@@ -121,14 +121,14 @@ cannot happen.
+-----------------------------------------------------------------------+
| Wait a minute! You said that updaters can make useful forward |
| progress concurrently with readers, but pre-existing readers will |
-| block ``synchronize_rcu()``!!! |
+| block synchronize_rcu()!!! |
| Just who are you trying to fool??? |
+-----------------------------------------------------------------------+
| **Answer**: |
+-----------------------------------------------------------------------+
| First, if updaters do not wish to be blocked by readers, they can use |
-| ``call_rcu()`` or ``kfree_rcu()``, which will be discussed later. |
-| Second, even when using ``synchronize_rcu()``, the other update-side |
+| call_rcu() or kfree_rcu(), which will be discussed later. |
+| Second, even when using synchronize_rcu(), the other update-side |
| code does run concurrently with readers, whether pre-existing or not. |
+-----------------------------------------------------------------------+
@@ -170,34 +170,34 @@ recovery from node failure, more or less as follows:
29 WRITE_ONCE(state, STATE_NORMAL);
30 }
-The RCU read-side critical section in ``do_something_dlm()`` works with
-the ``synchronize_rcu()`` in ``start_recovery()`` to guarantee that
-``do_something()`` never runs concurrently with ``recovery()``, but with
-little or no synchronization overhead in ``do_something_dlm()``.
+The RCU read-side critical section in do_something_dlm() works with
+the synchronize_rcu() in start_recovery() to guarantee that
+do_something() never runs concurrently with recovery(), but with
+little or no synchronization overhead in do_something_dlm().
+-----------------------------------------------------------------------+
| **Quick Quiz**: |
+-----------------------------------------------------------------------+
-| Why is the ``synchronize_rcu()`` on line 28 needed? |
+| Why is the synchronize_rcu() on line 28 needed? |
+-----------------------------------------------------------------------+
| **Answer**: |
+-----------------------------------------------------------------------+
| Without that extra grace period, memory reordering could result in |
-| ``do_something_dlm()`` executing ``do_something()`` concurrently with |
-| the last bits of ``recovery()``. |
+| do_something_dlm() executing do_something() concurrently with |
+| the last bits of recovery(). |
+-----------------------------------------------------------------------+
In order to avoid fatal problems such as deadlocks, an RCU read-side
-critical section must not contain calls to ``synchronize_rcu()``.
+critical section must not contain calls to synchronize_rcu().
Similarly, an RCU read-side critical section must not contain anything
that waits, directly or indirectly, on completion of an invocation of
-``synchronize_rcu()``.
+synchronize_rcu().
Although RCU's grace-period guarantee is useful in and of itself, with
`quite a few use cases <https://lwn.net/Articles/573497/>`__, it would
be good to be able to use RCU to coordinate read-side access to linked
data structures. For this, the grace-period guarantee is not sufficient,
-as can be seen in function ``add_gp_buggy()`` below. We will look at the
+as can be seen in function add_gp_buggy() below. We will look at the
reader's code later, but in the meantime, just think of the reader as
locklessly picking up the ``gp`` pointer, and, if the value loaded is
non-\ ``NULL``, locklessly accessing the ``->a`` and ``->b`` fields.
@@ -256,8 +256,8 @@ Publish/Subscribe Guarantee
RCU's publish-subscribe guarantee allows data to be inserted into a
linked data structure without disrupting RCU readers. The updater uses
-``rcu_assign_pointer()`` to insert the new data, and readers use
-``rcu_dereference()`` to access data, whether new or old. The following
+rcu_assign_pointer() to insert the new data, and readers use
+rcu_dereference() to access data, whether new or old. The following
shows an example of insertion:
::
@@ -279,7 +279,7 @@ shows an example of insertion:
15 return true;
16 }
-The ``rcu_assign_pointer()`` on line 13 is conceptually equivalent to a
+The rcu_assign_pointer() on line 13 is conceptually equivalent to a
simple assignment statement, but also guarantees that its assignment
will happen after the two assignments in lines 11 and 12, similar to the
C11 ``memory_order_release`` store operation. It also prevents any
@@ -289,7 +289,7 @@ number of “interesting” compiler optimizations, for example, the use of
+-----------------------------------------------------------------------+
| **Quick Quiz**: |
+-----------------------------------------------------------------------+
-| But ``rcu_assign_pointer()`` does nothing to prevent the two |
+| But rcu_assign_pointer() does nothing to prevent the two |
| assignments to ``p->a`` and ``p->b`` from being reordered. Can't that |
| also cause problems? |
+-----------------------------------------------------------------------+
@@ -303,7 +303,7 @@ number of “interesting” compiler optimizations, for example, the use of
It is tempting to assume that the reader need not do anything special to
control its accesses to the RCU-protected data, as shown in
-``do_something_gp_buggy()`` below:
+do_something_gp_buggy() below:
::
@@ -321,11 +321,10 @@ control its accesses to the RCU-protected data, as shown in
12 }
However, this temptation must be resisted because there are a
-surprisingly large number of ways that the compiler (to say nothing of
-`DEC Alpha CPUs <https://h71000.www7.hp.com/wizard/wiz_2637.html>`__)
-can trip this code up. For but one example, if the compiler were short
-of registers, it might choose to refetch from ``gp`` rather than keeping
-a separate copy in ``p`` as follows:
+surprisingly large number of ways that the compiler (or weak ordering
+CPUs like the DEC Alpha) can trip this code up. For but one example, if
+the compiler were short of registers, it might choose to refetch from
+``gp`` rather than keeping a separate copy in ``p`` as follows:
::
@@ -345,7 +344,7 @@ If this function ran concurrently with a series of updates that replaced
the current structure with a new one, the fetches of ``gp->a`` and
``gp->b`` might well come from two different structures, which could
cause serious confusion. To prevent this (and much else besides),
-``do_something_gp()`` uses ``rcu_dereference()`` to fetch from ``gp``:
+do_something_gp() uses rcu_dereference() to fetch from ``gp``:
::
@@ -362,21 +361,21 @@ cause serious confusion. To prevent this (and much else besides),
11 return false;
12 }
-The ``rcu_dereference()`` uses volatile casts and (for DEC Alpha) memory
+The rcu_dereference() uses volatile casts and (for DEC Alpha) memory
barriers in the Linux kernel. Should a `high-quality implementation of
C11 ``memory_order_consume``
[PDF] <http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf>`__
-ever appear, then ``rcu_dereference()`` could be implemented as a
+ever appear, then rcu_dereference() could be implemented as a
``memory_order_consume`` load. Regardless of the exact implementation, a
-pointer fetched by ``rcu_dereference()`` may not be used outside of the
+pointer fetched by rcu_dereference() may not be used outside of the
outermost RCU read-side critical section containing that
-``rcu_dereference()``, unless protection of the corresponding data
+rcu_dereference(), unless protection of the corresponding data
element has been passed from RCU to some other synchronization
mechanism, most commonly locking or `reference
counting <https://www.kernel.org/doc/Documentation/RCU/rcuref.txt>`__.
-In short, updaters use ``rcu_assign_pointer()`` and readers use
-``rcu_dereference()``, and these two RCU API elements work together to
+In short, updaters use rcu_assign_pointer() and readers use
+rcu_dereference(), and these two RCU API elements work together to
ensure that readers have a consistent view of newly added data elements.
Of course, it is also necessary to remove elements from RCU-protected
@@ -388,9 +387,9 @@ data structures, for example, using the following process:
the newly removed data element).
#. At this point, only the updater has a reference to the newly removed
data element, so it can safely reclaim the data element, for example,
- by passing it to ``kfree()``.
+ by passing it to kfree().
-This process is implemented by ``remove_gp_synchronous()``:
+This process is implemented by remove_gp_synchronous():
::
@@ -413,16 +412,16 @@ This process is implemented by ``remove_gp_synchronous()``:
This function is straightforward, with line 13 waiting for a grace
period before line 14 frees the old data element. This waiting ensures
-that readers will reach line 7 of ``do_something_gp()`` before the data
-element referenced by ``p`` is freed. The ``rcu_access_pointer()`` on
-line 6 is similar to ``rcu_dereference()``, except that:
+that readers will reach line 7 of do_something_gp() before the data
+element referenced by ``p`` is freed. The rcu_access_pointer() on
+line 6 is similar to rcu_dereference(), except that:
-#. The value returned by ``rcu_access_pointer()`` cannot be
+#. The value returned by rcu_access_pointer() cannot be
dereferenced. If you want to access the value pointed to as well as
- the pointer itself, use ``rcu_dereference()`` instead of
- ``rcu_access_pointer()``.
-#. The call to ``rcu_access_pointer()`` need not be protected. In
- contrast, ``rcu_dereference()`` must either be within an RCU
+ the pointer itself, use rcu_dereference() instead of
+ rcu_access_pointer().
+#. The call to rcu_access_pointer() need not be protected. In
+ contrast, rcu_dereference() must either be within an RCU
read-side critical section or in a code segment where the pointer
cannot change, for example, in code protected by the corresponding
update-side lock.
@@ -430,13 +429,13 @@ line 6 is similar to ``rcu_dereference()``, except that:
+-----------------------------------------------------------------------+
| **Quick Quiz**: |
+-----------------------------------------------------------------------+
-| Without the ``rcu_dereference()`` or the ``rcu_access_pointer()``, |
+| Without the rcu_dereference() or the rcu_access_pointer(), |
| what destructive optimizations might the compiler make use of? |
+-----------------------------------------------------------------------+
| **Answer**: |
+-----------------------------------------------------------------------+
-| Let's start with what happens to ``do_something_gp()`` if it fails to |
-| use ``rcu_dereference()``. It could reuse a value formerly fetched |
+| Let's start with what happens to do_something_gp() if it fails to |
+| use rcu_dereference(). It could reuse a value formerly fetched |
| from this same pointer. It could also fetch the pointer from ``gp`` |
| in a byte-at-a-time manner, resulting in *load tearing*, in turn |
| resulting in a bytewise mash-up of two distinct pointer values. It might |
@@ -445,15 +444,15 @@ line 6 is similar to ``rcu_dereference()``, except that:
| update has changed the pointer to match the wrong guess. Too bad |
| about any dereferences that returned pre-initialization garbage in |
| the meantime! |
-| For ``remove_gp_synchronous()``, as long as all modifications to |
+| For remove_gp_synchronous(), as long as all modifications to |
| ``gp`` are carried out while holding ``gp_lock``, the above |
| optimizations are harmless. However, ``sparse`` will complain if you |
| define ``gp`` with ``__rcu`` and then access it without using either |
-| ``rcu_access_pointer()`` or ``rcu_dereference()``. |
+| rcu_access_pointer() or rcu_dereference(). |
+-----------------------------------------------------------------------+
In short, RCU's publish-subscribe guarantee is provided by the
-combination of ``rcu_assign_pointer()`` and ``rcu_dereference()``. This
+combination of rcu_assign_pointer() and rcu_dereference(). This
guarantee allows data elements to be safely added to RCU-protected
linked data structures without disrupting RCU readers. This guarantee
can be used in combination with the grace-period guarantee to also allow
@@ -462,9 +461,9 @@ again without disrupting RCU readers.
This guarantee was only partially premeditated. DYNIX/ptx used an
explicit memory barrier for publication, but had nothing resembling
-``rcu_dereference()`` for subscription, nor did it have anything
+rcu_dereference() for subscription, nor did it have anything
resembling the dependency-ordering barrier that was later subsumed
-into ``rcu_dereference()`` and later still into ``READ_ONCE()``. The
+into rcu_dereference() and later still into READ_ONCE(). The
need for these operations made itself known quite suddenly at a
late-1990s meeting with the DEC Alpha architects, back in the days when
DEC was still a free-standing company. It took the Alpha architects a
@@ -474,7 +473,7 @@ documentation did not make this point clear. More recent work with the C
and C++ standards committees has provided much education on tricks and
traps from the compiler. In short, compilers were much less tricky in
the early 1990s, but in 2015, don't even think about omitting
-``rcu_dereference()``!
+rcu_dereference()!
Memory-Barrier Guarantees
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -484,31 +483,31 @@ demonstrates the need for RCU's stringent memory-ordering guarantees on
systems with more than one CPU:
#. Each CPU that has an RCU read-side critical section that begins
- before ``synchronize_rcu()`` starts is guaranteed to execute a full
+ before synchronize_rcu() starts is guaranteed to execute a full
memory barrier between the time that the RCU read-side critical
- section ends and the time that ``synchronize_rcu()`` returns. Without
+ section ends and the time that synchronize_rcu() returns. Without
this guarantee, a pre-existing RCU read-side critical section might
hold a reference to the newly removed ``struct foo`` after the
- ``kfree()`` on line 14 of ``remove_gp_synchronous()``.
+ kfree() on line 14 of remove_gp_synchronous().
#. Each CPU that has an RCU read-side critical section that ends after
- ``synchronize_rcu()`` returns is guaranteed to execute a full memory
- barrier between the time that ``synchronize_rcu()`` begins and the
+ synchronize_rcu() returns is guaranteed to execute a full memory
+ barrier between the time that synchronize_rcu() begins and the
time that the RCU read-side critical section begins. Without this
guarantee, a later RCU read-side critical section running after the
- ``kfree()`` on line 14 of ``remove_gp_synchronous()`` might later run
- ``do_something_gp()`` and find the newly deleted ``struct foo``.
-#. If the task invoking ``synchronize_rcu()`` remains on a given CPU,
+ kfree() on line 14 of remove_gp_synchronous() might later run
+ do_something_gp() and find the newly deleted ``struct foo``.
+#. If the task invoking synchronize_rcu() remains on a given CPU,
then that CPU is guaranteed to execute a full memory barrier sometime
- during the execution of ``synchronize_rcu()``. This guarantee ensures
- that the ``kfree()`` on line 14 of ``remove_gp_synchronous()`` really
+ during the execution of synchronize_rcu(). This guarantee ensures
+ that the kfree() on line 14 of remove_gp_synchronous() really
does execute after the removal on line 11.
-#. If the task invoking ``synchronize_rcu()`` migrates among a group of
+#. If the task invoking synchronize_rcu() migrates among a group of
CPUs during that invocation, then each of the CPUs in that group is
guaranteed to execute a full memory barrier sometime during the
- execution of ``synchronize_rcu()``. This guarantee also ensures that
- the ``kfree()`` on line 14 of ``remove_gp_synchronous()`` really does
+ execution of synchronize_rcu(). This guarantee also ensures that
+ the kfree() on line 14 of remove_gp_synchronous() really does
execute after the removal on line 11, but also in the case where the
- thread executing the ``synchronize_rcu()`` migrates in the meantime.
+ thread executing the synchronize_rcu() migrates in the meantime.
+-----------------------------------------------------------------------+
| **Quick Quiz**: |
@@ -516,19 +515,19 @@ systems with more than one CPU:
| Given that multiple CPUs can start RCU read-side critical sections at |
| any time without any ordering whatsoever, how can RCU possibly tell |
| whether or not a given RCU read-side critical section starts before a |
-| given instance of ``synchronize_rcu()``? |
+| given instance of synchronize_rcu()? |
+-----------------------------------------------------------------------+
| **Answer**: |
+-----------------------------------------------------------------------+
| If RCU cannot tell whether or not a given RCU read-side critical |
-| section starts before a given instance of ``synchronize_rcu()``, then |
+| section starts before a given instance of synchronize_rcu(), then |
| it must assume that the RCU read-side critical section started first. |
-| In other words, a given instance of ``synchronize_rcu()`` can avoid |
+| In other words, a given instance of synchronize_rcu() can avoid |
| waiting on a given RCU read-side critical section only if it can |
-| prove that ``synchronize_rcu()`` started first. |
-| A related question is “When ``rcu_read_lock()`` doesn't generate any |
+| prove that synchronize_rcu() started first. |
+| A related question is “When rcu_read_lock() doesn't generate any |
| code, why does it matter how it relates to a grace period?” The |
-| answer is that it is not the relationship of ``rcu_read_lock()`` |
+| answer is that it is not the relationship of rcu_read_lock() |
| itself that is important, but rather the relationship of the code |
| within the enclosed RCU read-side critical section to the code |
| preceding and following the grace period. If we take this viewpoint, |
@@ -556,14 +555,14 @@ systems with more than one CPU:
| Yes, they really are required. To see why the first guarantee is |
| required, consider the following sequence of events: |
| |
-| #. CPU 1: ``rcu_read_lock()`` |
+| #. CPU 1: rcu_read_lock() |
| #. CPU 1: ``q = rcu_dereference(gp); /* Very likely to return p. */`` |
| #. CPU 0: ``list_del_rcu(p);`` |
-| #. CPU 0: ``synchronize_rcu()`` starts. |
+| #. CPU 0: synchronize_rcu() starts. |
| #. CPU 1: ``do_something_with(q->a);`` |
| ``/* No smp_mb(), so might happen after kfree(). */`` |
-| #. CPU 1: ``rcu_read_unlock()`` |
-| #. CPU 0: ``synchronize_rcu()`` returns. |
+| #. CPU 1: rcu_read_unlock() |
+| #. CPU 0: synchronize_rcu() returns. |
| #. CPU 0: ``kfree(p);`` |
| |
| Therefore, there absolutely must be a full memory barrier between the |
@@ -574,14 +573,14 @@ systems with more than one CPU:
| is roughly similar: |
| |
| #. CPU 0: ``list_del_rcu(p);`` |
-| #. CPU 0: ``synchronize_rcu()`` starts. |
-| #. CPU 1: ``rcu_read_lock()`` |
+| #. CPU 0: synchronize_rcu() starts. |
+| #. CPU 1: rcu_read_lock() |
| #. CPU 1: ``q = rcu_dereference(gp);`` |
| ``/* Might return p if no memory barrier. */`` |
-| #. CPU 0: ``synchronize_rcu()`` returns. |
+| #. CPU 0: synchronize_rcu() returns. |
| #. CPU 0: ``kfree(p);`` |
| #. CPU 1: ``do_something_with(q->a); /* Boom!!! */`` |
-| #. CPU 1: ``rcu_read_unlock()`` |
+| #. CPU 1: rcu_read_unlock() |
| |
| And similarly, without a memory barrier between the beginning of the |
| grace period and the beginning of the RCU read-side critical section, |
@@ -597,7 +596,7 @@ systems with more than one CPU:
+-----------------------------------------------------------------------+
| **Quick Quiz**: |
+-----------------------------------------------------------------------+
-| You claim that ``rcu_read_lock()`` and ``rcu_read_unlock()`` generate |
+| You claim that rcu_read_lock() and rcu_read_unlock() generate |
| absolutely no code in some kernel builds. This means that the |
| compiler might arbitrarily rearrange consecutive RCU read-side |
| critical sections. Given such rearrangement, if a given RCU read-side |
@@ -607,11 +606,11 @@ systems with more than one CPU:
+-----------------------------------------------------------------------+
| **Answer**: |
+-----------------------------------------------------------------------+
-| In cases where ``rcu_read_lock()`` and ``rcu_read_unlock()`` generate |
+| In cases where rcu_read_lock() and rcu_read_unlock() generate |
| absolutely no code, RCU infers quiescent states only at special |
| locations, for example, within the scheduler. Because calls to |
-| ``schedule()`` had better prevent calling-code accesses to shared |
-| variables from being rearranged across the call to ``schedule()``, if |
+| schedule() had better prevent calling-code accesses to shared |
+| variables from being rearranged across the call to schedule(), if |
| RCU detects the end of a given RCU read-side critical section, it |
| will necessarily detect the end of all prior RCU read-side critical |
| sections, no matter how aggressively the compiler scrambles the code. |
@@ -655,8 +654,8 @@ read-side critical section might search for a given data element, and
then might acquire the update-side spinlock in order to update that
element, all while remaining in that RCU read-side critical section. Of
course, it is necessary to exit the RCU read-side critical section
-before invoking ``synchronize_rcu()``, however, this inconvenience can
-be avoided through use of the ``call_rcu()`` and ``kfree_rcu()`` API
+before invoking synchronize_rcu(); however, this inconvenience can
+be avoided through use of the call_rcu() and kfree_rcu() API
members described later in this document.
+-----------------------------------------------------------------------+
@@ -694,10 +693,10 @@ these non-guarantees were premeditated.
Readers Impose Minimal Ordering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Reader-side markers such as ``rcu_read_lock()`` and
-``rcu_read_unlock()`` provide absolutely no ordering guarantees except
+Reader-side markers such as rcu_read_lock() and
+rcu_read_unlock() provide absolutely no ordering guarantees except
through their interaction with the grace-period APIs such as
-``synchronize_rcu()``. To see this, consider the following pair of
+synchronize_rcu(). To see this, consider the following pair of
threads:
::
@@ -722,7 +721,7 @@ threads:
18 rcu_read_unlock();
19 }
-After ``thread0()`` and ``thread1()`` execute concurrently, it is quite
+After thread0() and thread1() execute concurrently, it is quite
possible to have
::
@@ -730,7 +729,7 @@ possible to have
(r1 == 1 && r2 == 0)
(that is, ``y`` appears to have been assigned before ``x``), which would
-not be possible if ``rcu_read_lock()`` and ``rcu_read_unlock()`` had
+not be possible if rcu_read_lock() and rcu_read_unlock() had
much in the way of ordering properties. But they do not, so the CPU is
within its rights to do significant reordering. This is by design: Any
significant ordering constraints would slow down these fast-path APIs.
@@ -742,14 +741,14 @@ significant ordering constraints would slow down these fast-path APIs.
+-----------------------------------------------------------------------+
| **Answer**: |
+-----------------------------------------------------------------------+
-| No, the volatile casts in ``READ_ONCE()`` and ``WRITE_ONCE()`` |
+| No, the volatile casts in READ_ONCE() and WRITE_ONCE() |
| prevent the compiler from reordering in this particular case. |
+-----------------------------------------------------------------------+
Readers Do Not Exclude Updaters
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Neither ``rcu_read_lock()`` nor ``rcu_read_unlock()`` exclude updates.
+Neither rcu_read_lock() nor rcu_read_unlock() exclude updates.
All they do is to prevent grace periods from ending. The following
example illustrates this:
@@ -775,19 +774,19 @@ example illustrates this:
18 spin_unlock(&my_lock);
19 }
-If the ``thread0()`` function's ``rcu_read_lock()`` excluded the
-``thread1()`` function's update, the ``WARN_ON()`` could never fire. But
-the fact is that ``rcu_read_lock()`` does not exclude much of anything
-aside from subsequent grace periods, of which ``thread1()`` has none, so
-the ``WARN_ON()`` can and does fire.
+If the thread0() function's rcu_read_lock() excluded the
+thread1() function's update, the WARN_ON() could never fire. But
+the fact is that rcu_read_lock() does not exclude much of anything
+aside from subsequent grace periods, of which thread1() has none, so
+the WARN_ON() can and does fire.
Updaters Only Wait For Old Readers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-It might be tempting to assume that after ``synchronize_rcu()``
+It might be tempting to assume that after synchronize_rcu()
completes, there are no readers executing. This temptation must be
avoided because new readers can start immediately after
-``synchronize_rcu()`` starts, and ``synchronize_rcu()`` is under no
+synchronize_rcu() starts, and synchronize_rcu() is under no
obligation to wait for these new readers.
+-----------------------------------------------------------------------+
@@ -799,10 +798,10 @@ obligation to wait for these new readers.
+-----------------------------------------------------------------------+
| **Answer**: |
+-----------------------------------------------------------------------+
-| For no time at all. Even if ``synchronize_rcu()`` were to wait until |
+| For no time at all. Even if synchronize_rcu() were to wait until |
| all readers had completed, a new reader might start immediately after |
-| ``synchronize_rcu()`` completed. Therefore, the code following |
-| ``synchronize_rcu()`` can *never* rely on there being no readers. |
+| synchronize_rcu() completed. Therefore, the code following |
+| synchronize_rcu() can *never* rely on there being no readers. |
+-----------------------------------------------------------------------+
Grace Periods Don't Partition Read-Side Critical Sections
@@ -892,12 +891,12 @@ period is known to end before the second grace period starts:
28 rcu_read_unlock();
29 }
-Here, if ``(r1 == 1)``, then ``thread0()``'s write to ``b`` must happen
-before the end of ``thread1()``'s grace period. If in addition
-``(r4 == 1)``, then ``thread3()``'s read from ``b`` must happen after
-the beginning of ``thread2()``'s grace period. If it is also the case
-that ``(r2 == 1)``, then the end of ``thread1()``'s grace period must
-precede the beginning of ``thread2()``'s grace period. This mean that
+Here, if ``(r1 == 1)``, then thread0()'s write to ``b`` must happen
+before the end of thread1()'s grace period. If in addition
+``(r4 == 1)``, then thread3()'s read from ``b`` must happen after
+the beginning of thread2()'s grace period. If it is also the case
+that ``(r2 == 1)``, then the end of thread1()'s grace period must
+precede the beginning of thread2()'s grace period. This means that
the two RCU read-side critical sections cannot overlap, guaranteeing
that ``(r3 == 1)``. As a result, the outcome:
@@ -1076,8 +1075,8 @@ is captured by the following list of situations:
b. Wait-free read-side primitives for real-time use.
This focus on read-mostly situations means that RCU must interoperate
-with other synchronization primitives. For example, the ``add_gp()`` and
-``remove_gp_synchronous()`` examples discussed earlier use RCU to
+with other synchronization primitives. For example, the add_gp() and
+remove_gp_synchronous() examples discussed earlier use RCU to
protect readers and locking to coordinate updaters. However, the need
extends much farther, requiring that a variety of synchronization
primitives be legal within RCU read-side critical sections, including
@@ -1104,11 +1103,11 @@ memory barriers.
| sections. |
| Note that it *is* legal for a normal RCU read-side critical section |
| to conditionally acquire a sleeping lock (as in |
-| ``mutex_trylock()``), but only as long as it does not loop |
+| mutex_trylock()), but only as long as it does not loop |
| indefinitely attempting to conditionally acquire that sleeping lock. |
-| The key point is that things like ``mutex_trylock()`` either return |
+| The key point is that things like mutex_trylock() either return |
| with the mutex held, or return an error indication if the mutex was |
-| not immediately available. Either way, ``mutex_trylock()`` returns |
+| not immediately available. Either way, mutex_trylock() returns |
| immediately without sleeping. |
+-----------------------------------------------------------------------+
@@ -1182,8 +1181,8 @@ and has become decreasingly so as memory sizes have expanded and memory
costs have plummeted. However, as I learned from Matt Mackall's
`bloatwatch <http://elinux.org/Linux_Tiny-FAQ>`__ efforts, memory
footprint is critically important on single-CPU systems with
-non-preemptible (``CONFIG_PREEMPT=n``) kernels, and thus `tiny
-RCU <https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com>`__
+non-preemptible (``CONFIG_PREEMPTION=n``) kernels, and thus `tiny
+RCU <https://lore.kernel.org/r/20090113221724.GA15307@linux.vnet.ibm.com>`__
was born. Josh Triplett has since taken over the small-memory banner
with his `Linux kernel tinification <https://tiny.wiki.kernel.org/>`__
project, which resulted in `SRCU <Sleepable RCU_>`__ becoming optional
@@ -1191,57 +1190,57 @@ for those kernels not needing it.
The remaining performance requirements are, for the most part,
unsurprising. For example, in keeping with RCU's read-side
-specialization, ``rcu_dereference()`` should have negligible overhead
+specialization, rcu_dereference() should have negligible overhead
(for example, suppression of a few minor compiler optimizations).
-Similarly, in non-preemptible environments, ``rcu_read_lock()`` and
-``rcu_read_unlock()`` should have exactly zero overhead.
+Similarly, in non-preemptible environments, rcu_read_lock() and
+rcu_read_unlock() should have exactly zero overhead.
In preemptible environments, in the case where the RCU read-side
critical section was not preempted (as will be the case for the
-highest-priority real-time process), ``rcu_read_lock()`` and
-``rcu_read_unlock()`` should have minimal overhead. In particular, they
+highest-priority real-time process), rcu_read_lock() and
+rcu_read_unlock() should have minimal overhead. In particular, they
should not contain atomic read-modify-write operations, memory-barrier
instructions, preemption disabling, interrupt disabling, or backwards
branches. However, in the case where the RCU read-side critical section
-was preempted, ``rcu_read_unlock()`` may acquire spinlocks and disable
+was preempted, rcu_read_unlock() may acquire spinlocks and disable
interrupts. This is why it is better to nest an RCU read-side critical
section within a preempt-disable region than vice versa, at least in
cases where that critical section is short enough to avoid unduly
degrading real-time latencies.
-The ``synchronize_rcu()`` grace-period-wait primitive is optimized for
+The synchronize_rcu() grace-period-wait primitive is optimized for
throughput. It may therefore incur several milliseconds of latency in
addition to the duration of the longest RCU read-side critical section.
On the other hand, multiple concurrent invocations of
-``synchronize_rcu()`` are required to use batching optimizations so that
+synchronize_rcu() are required to use batching optimizations so that
they can be satisfied by a single underlying grace-period-wait
operation. For example, in the Linux kernel, it is not unusual for a
single grace-period-wait operation to serve more than `1,000 separate
invocations <https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response>`__
-of ``synchronize_rcu()``, thus amortizing the per-invocation overhead
+of synchronize_rcu(), thus amortizing the per-invocation overhead
down to nearly zero. However, the grace-period optimization is also
required to avoid measurable degradation of real-time scheduling and
interrupt latencies.
-In some cases, the multi-millisecond ``synchronize_rcu()`` latencies are
-unacceptable. In these cases, ``synchronize_rcu_expedited()`` may be
+In some cases, the multi-millisecond synchronize_rcu() latencies are
+unacceptable. In these cases, synchronize_rcu_expedited() may be
used instead, reducing the grace-period latency down to a few tens of
microseconds on small systems, at least in cases where the RCU read-side
critical sections are short. There are currently no special latency
-requirements for ``synchronize_rcu_expedited()`` on large systems, but,
+requirements for synchronize_rcu_expedited() on large systems, but,
consistent with the empirical nature of the RCU specification, that is
subject to change. However, there most definitely are scalability
-requirements: A storm of ``synchronize_rcu_expedited()`` invocations on
+requirements: A storm of synchronize_rcu_expedited() invocations on
4096 CPUs should at least make reasonable forward progress. In return
-for its shorter latencies, ``synchronize_rcu_expedited()`` is permitted
+for its shorter latencies, synchronize_rcu_expedited() is permitted
to impose modest degradation of real-time latency on non-idle online
CPUs. Here, “modest” means roughly the same latency degradation as a
scheduling-clock interrupt.
There are a number of situations where even
-``synchronize_rcu_expedited()``'s reduced grace-period latency is
-unacceptable. In these situations, the asynchronous ``call_rcu()`` can
-be used in place of ``synchronize_rcu()`` as follows:
+synchronize_rcu_expedited()'s reduced grace-period latency is
+unacceptable. In these situations, the asynchronous call_rcu() can
+be used in place of synchronize_rcu() as follows:
::
@@ -1275,19 +1274,19 @@ be used in place of ``synchronize_rcu()`` as follows:
28 }
A definition of ``struct foo`` is finally needed, and appears on
-lines 1-5. The function ``remove_gp_cb()`` is passed to ``call_rcu()``
+lines 1-5. The function remove_gp_cb() is passed to call_rcu()
on line 25, and will be invoked after the end of a subsequent grace
-period. This gets the same effect as ``remove_gp_synchronous()``, but
+period. This gets the same effect as remove_gp_synchronous(), but
without forcing the updater to wait for a grace period to elapse. The
-``call_rcu()`` function may be used in a number of situations where
-neither ``synchronize_rcu()`` nor ``synchronize_rcu_expedited()`` would
-be legal, including within preempt-disable code, ``local_bh_disable()``
+call_rcu() function may be used in a number of situations where
+neither synchronize_rcu() nor synchronize_rcu_expedited() would
+be legal, including within preempt-disable code, local_bh_disable()
code, interrupt-disable code, and interrupt handlers. However, even
-``call_rcu()`` is illegal within NMI handlers and from idle and offline
-CPUs. The callback function (``remove_gp_cb()`` in this case) will be
+call_rcu() is illegal within NMI handlers and from idle and offline
+CPUs. The callback function (remove_gp_cb() in this case) will be
executed within softirq (software interrupt) environment within the
Linux kernel, either within a real softirq handler or under the
-protection of ``local_bh_disable()``. In both the Linux kernel and in
+protection of local_bh_disable(). In both the Linux kernel and in
userspace, it is bad practice to write an RCU callback function that
takes too long. Long-running operations should be relegated to separate
threads or (in the Linux kernel) workqueues.
@@ -1295,23 +1294,23 @@ threads or (in the Linux kernel) workqueues.
+-----------------------------------------------------------------------+
| **Quick Quiz**: |
+-----------------------------------------------------------------------+
-| Why does line 19 use ``rcu_access_pointer()``? After all, |
-| ``call_rcu()`` on line 25 stores into the structure, which would |
+| Why does line 19 use rcu_access_pointer()? After all, |
+| call_rcu() on line 25 stores into the structure, which would |
| interact badly with concurrent insertions. Doesn't this mean that |
-| ``rcu_dereference()`` is required? |
+| rcu_dereference() is required? |
+-----------------------------------------------------------------------+
| **Answer**: |
+-----------------------------------------------------------------------+
| Presumably the ``->gp_lock`` acquired on line 18 excludes any |
-| changes, including any insertions that ``rcu_dereference()`` would |
+| changes, including any insertions that rcu_dereference() would |
| protect against. Therefore, any insertions will be delayed until |
| after ``->gp_lock`` is released on line 25, which in turn means that |
-| ``rcu_access_pointer()`` suffices. |
+| rcu_access_pointer() suffices. |
+-----------------------------------------------------------------------+
-However, all that ``remove_gp_cb()`` is doing is invoking ``kfree()`` on
+However, all that remove_gp_cb() is doing is invoking kfree() on
the data element. This is a common idiom, and is supported by
-``kfree_rcu()``, which allows “fire and forget” operation as shown
+kfree_rcu(), which allows “fire and forget” operation as shown
below:
::
@@ -1338,20 +1337,20 @@ below:
20 return true;
21 }
-Note that ``remove_gp_faf()`` simply invokes ``kfree_rcu()`` and
+Note that remove_gp_faf() simply invokes kfree_rcu() and
proceeds, without any need to pay any further attention to the
-subsequent grace period and ``kfree()``. It is permissible to invoke
-``kfree_rcu()`` from the same environments as for ``call_rcu()``.
-Interestingly enough, DYNIX/ptx had the equivalents of ``call_rcu()``
-and ``kfree_rcu()``, but not ``synchronize_rcu()``. This was due to the
+subsequent grace period and kfree(). It is permissible to invoke
+kfree_rcu() from the same environments as for call_rcu().
+Interestingly enough, DYNIX/ptx had the equivalents of call_rcu()
+and kfree_rcu(), but not synchronize_rcu(). This was due to the
fact that RCU was not heavily used within DYNIX/ptx, so the very few
-places that needed something like ``synchronize_rcu()`` simply
+places that needed something like synchronize_rcu() simply
open-coded it.
+-----------------------------------------------------------------------+
| **Quick Quiz**: |
+-----------------------------------------------------------------------+
-| Earlier it was claimed that ``call_rcu()`` and ``kfree_rcu()`` |
+| Earlier it was claimed that call_rcu() and kfree_rcu() |
| allowed updaters to avoid being blocked by readers. But how can that |
| be correct, given that the invocation of the callback and the freeing |
| of the memory (respectively) must still wait for a grace period to |
@@ -1363,16 +1362,16 @@ open-coded it.
| definition would say that updates in garbage-collected languages |
| cannot complete until the next time the garbage collector runs, which |
| does not seem at all reasonable. The key point is that in most cases, |
-| an updater using either ``call_rcu()`` or ``kfree_rcu()`` can proceed |
-| to the next update as soon as it has invoked ``call_rcu()`` or |
-| ``kfree_rcu()``, without having to wait for a subsequent grace |
+| an updater using either call_rcu() or kfree_rcu() can proceed |
+| to the next update as soon as it has invoked call_rcu() or |
+| kfree_rcu(), without having to wait for a subsequent grace |
| period. |
+-----------------------------------------------------------------------+
But what if the updater must wait for the completion of code to be
executed after the end of the grace period, but has other tasks that can
be carried out in the meantime? The polling-style
-``get_state_synchronize_rcu()`` and ``cond_synchronize_rcu()`` functions
+get_state_synchronize_rcu() and cond_synchronize_rcu() functions
may be used for this purpose, as shown below:
::
@@ -1397,11 +1396,11 @@ may be used for this purpose, as shown below:
18 return true;
19 }
-On line 14, ``get_state_synchronize_rcu()`` obtains a “cookie” from RCU,
+On line 14, get_state_synchronize_rcu() obtains a “cookie” from RCU,
then line 15 carries out other tasks, and finally, line 16 returns
immediately if a grace period has elapsed in the meantime, but otherwise
waits as required. The need for ``get_state_synchronize_rcu()`` and
-``cond_synchronize_rcu()`` has appeared quite recently, so it is too
+cond_synchronize_rcu() has appeared quite recently, so it is too
early to tell whether they will stand the test of time.
RCU thus provides a range of tools to allow updaters to strike the
@@ -1421,8 +1420,8 @@ example, an infinite loop in an RCU read-side critical section must by
definition prevent later grace periods from ever completing. For a more
involved example, consider a 64-CPU system built with
``CONFIG_RCU_NOCB_CPU=y`` and booted with ``rcu_nocbs=1-63``, where
-CPUs 1 through 63 spin in tight loops that invoke ``call_rcu()``. Even
-if these tight loops also contain calls to ``cond_resched()`` (thus
+CPUs 1 through 63 spin in tight loops that invoke call_rcu(). Even
+if these tight loops also contain calls to cond_resched() (thus
allowing grace periods to complete), CPU 0 simply will not be able to
invoke callbacks as fast as the other 63 CPUs can register them, at
least not until the system runs out of memory. In both of these
@@ -1435,21 +1434,21 @@ RCU takes the following steps to encourage timely completion of grace
periods:
#. If a grace period fails to complete within 100 milliseconds, RCU
- causes future invocations of ``cond_resched()`` on the holdout CPUs
+ causes future invocations of cond_resched() on the holdout CPUs
to provide an RCU quiescent state. RCU also causes those CPUs'
- ``need_resched()`` invocations to return ``true``, but only after the
+ need_resched() invocations to return ``true``, but only after the
corresponding CPU's next scheduling-clock interrupt.
#. CPUs mentioned in the ``nohz_full`` kernel boot parameter can run
indefinitely in the kernel without scheduling-clock interrupts, which
- defeats the above ``need_resched()`` strategem. RCU will therefore
- invoke ``resched_cpu()`` on any ``nohz_full`` CPUs still holding out
+ defeats the above need_resched() stratagem. RCU will therefore
+ invoke resched_cpu() on any ``nohz_full`` CPUs still holding out
after 109 milliseconds.
#. In kernels built with ``CONFIG_RCU_BOOST=y``, if a given task that
has been preempted within an RCU read-side critical section is
holding out for more than 500 milliseconds, RCU will resort to
priority boosting.
#. If a CPU is still holding out 10 seconds into the grace period, RCU
- will invoke ``resched_cpu()`` on it regardless of its ``nohz_full``
+ will invoke resched_cpu() on it regardless of its ``nohz_full``
state.
The above values are defaults for systems running with ``HZ=1000``. They
@@ -1460,7 +1459,7 @@ caution when changing them. Note that these forward-progress measures
are provided only for RCU, not for `SRCU <Sleepable RCU_>`__ or `Tasks
RCU`_.
-RCU takes the following steps in ``call_rcu()`` to encourage timely
+RCU takes the following steps in call_rcu() to encourage timely
invocation of callbacks when any given non-\ ``rcu_nocbs`` CPU has
10,000 callbacks, or has 10,000 more callbacks than it had the last time
encouragement was provided:
@@ -1481,8 +1480,8 @@ RCU, not for `SRCU <Sleepable RCU_>`__ or `Tasks
RCU`_. Even for RCU, callback-invocation forward
progress for ``rcu_nocbs`` CPUs is much less well-developed, in part
because workloads benefiting from ``rcu_nocbs`` CPUs tend to invoke
-``call_rcu()`` relatively infrequently. If workloads emerge that need
-both ``rcu_nocbs`` CPUs and high ``call_rcu()`` invocation rates, then
+call_rcu() relatively infrequently. If workloads emerge that need
+both ``rcu_nocbs`` CPUs and high call_rcu() invocation rates, then
additional forward-progress work will be required.
Composability
@@ -1496,11 +1495,11 @@ in fact may be nested arbitrarily deeply. In practice, as with all
real-world implementations of composable constructs, there are
limitations.
-Implementations of RCU for which ``rcu_read_lock()`` and
-``rcu_read_unlock()`` generate no code, such as Linux-kernel RCU when
-``CONFIG_PREEMPT=n``, can be nested arbitrarily deeply. After all, there
+Implementations of RCU for which rcu_read_lock() and
+rcu_read_unlock() generate no code, such as Linux-kernel RCU when
+``CONFIG_PREEMPTION=n``, can be nested arbitrarily deeply. After all, there
is no overhead. Except that if all these instances of
-``rcu_read_lock()`` and ``rcu_read_unlock()`` are visible to the
+rcu_read_lock() and rcu_read_unlock() are visible to the
compiler, compilation will eventually fail due to exhausting memory,
mass storage, or user patience, whichever comes first. If the nesting is
not visible to the compiler, as is the case with mutually recursive
@@ -1558,11 +1557,11 @@ argue that such workloads should instead use something other than RCU,
the fact remains that RCU must handle such workloads gracefully. This
requirement is another factor driving batching of grace periods, but it
is also the driving force behind the checks for large numbers of queued
-RCU callbacks in the ``call_rcu()`` code path. Finally, high update
+RCU callbacks in the call_rcu() code path. Finally, high update
rates should not delay RCU read-side critical sections, although some
small read-side delays can occur when using
-``synchronize_rcu_expedited()``, courtesy of this function's use of
-``smp_call_function_single()``.
+synchronize_rcu_expedited(), courtesy of this function's use of
+smp_call_function_single().
Although all three of these corner cases were understood in the early
1990s, a simple user-level test consisting of ``close(open(path))`` in a
@@ -1583,48 +1582,48 @@ Software-Engineering Requirements
Between Murphy's Law and “To err is human”, it is necessary to guard
against mishaps and misuse:
-#. It is all too easy to forget to use ``rcu_read_lock()`` everywhere
+#. It is all too easy to forget to use rcu_read_lock() everywhere
that it is needed, so kernels built with ``CONFIG_PROVE_RCU=y`` will
- splat if ``rcu_dereference()`` is used outside of an RCU read-side
+ splat if rcu_dereference() is used outside of an RCU read-side
critical section. Update-side code can use
- ``rcu_dereference_protected()``, which takes a `lockdep
+ rcu_dereference_protected(), which takes a `lockdep
expression <https://lwn.net/Articles/371986/>`__ to indicate what is
providing the protection. If the indicated protection is not
provided, a lockdep splat is emitted.
Code shared between readers and updaters can use
- ``rcu_dereference_check()``, which also takes a lockdep expression,
- and emits a lockdep splat if neither ``rcu_read_lock()`` nor the
+ rcu_dereference_check(), which also takes a lockdep expression,
+ and emits a lockdep splat if neither rcu_read_lock() nor the
indicated protection is in place. In addition,
- ``rcu_dereference_raw()`` is used in those (hopefully rare) cases
+ rcu_dereference_raw() is used in those (hopefully rare) cases
where the required protection cannot be easily described. Finally,
- ``rcu_read_lock_held()`` is provided to allow a function to verify
+ rcu_read_lock_held() is provided to allow a function to verify
that it has been invoked within an RCU read-side critical section. I
was made aware of this set of requirements shortly after Thomas
Gleixner audited a number of RCU uses.
#. A given function might wish to check for RCU-related preconditions
upon entry, before using any other RCU API. The
- ``rcu_lockdep_assert()`` does this job, asserting the expression in
+ rcu_lockdep_assert() does this job, asserting the expression in
kernels having lockdep enabled and doing nothing otherwise.
-#. It is also easy to forget to use ``rcu_assign_pointer()`` and
- ``rcu_dereference()``, perhaps (incorrectly) substituting a simple
+#. It is also easy to forget to use rcu_assign_pointer() and
+ rcu_dereference(), perhaps (incorrectly) substituting a simple
assignment. To catch this sort of error, a given RCU-protected
pointer may be tagged with ``__rcu``, after which sparse will
complain about simple-assignment accesses to that pointer. Arnd
Bergmann made me aware of this requirement, and also supplied the
needed `patch series <https://lwn.net/Articles/376011/>`__.
#. Kernels built with ``CONFIG_DEBUG_OBJECTS_RCU_HEAD=y`` will splat if
- a data element is passed to ``call_rcu()`` twice in a row, without a
+ a data element is passed to call_rcu() twice in a row, without a
grace period in between. (This error is similar to a double free.)
The corresponding ``rcu_head`` structures that are dynamically
allocated are automatically tracked, but ``rcu_head`` structures
allocated on the stack must be initialized with
- ``init_rcu_head_on_stack()`` and cleaned up with
- ``destroy_rcu_head_on_stack()``. Similarly, statically allocated
+ init_rcu_head_on_stack() and cleaned up with
+ destroy_rcu_head_on_stack(). Similarly, statically allocated
non-stack ``rcu_head`` structures must be initialized with
- ``init_rcu_head()`` and cleaned up with ``destroy_rcu_head()``.
+ init_rcu_head() and cleaned up with destroy_rcu_head().
Mathieu Desnoyers made me aware of this requirement, and also
supplied the needed
- `patch <https://lkml.kernel.org/g/20100319013024.GA28456@Krystal>`__.
+ `patch <https://lore.kernel.org/r/20100319013024.GA28456@Krystal>`__.
#. An infinite loop in an RCU read-side critical section will eventually
trigger an RCU CPU stall warning splat, with the duration of
“eventually” being controlled by the ``RCU_CPU_STALL_TIMEOUT``
@@ -1638,9 +1637,9 @@ against mishaps and misuse:
``rcupdate.rcu_cpu_stall_suppress`` to suppress the splats. This
kernel parameter may also be set via ``sysfs``. Furthermore, RCU CPU
stall warnings are counter-productive during sysrq dumps and during
- panics. RCU therefore supplies the ``rcu_sysrq_start()`` and
- ``rcu_sysrq_end()`` API members to be called before and after long
- sysrq dumps. RCU also supplies the ``rcu_panic()`` notifier that is
+ panics. RCU therefore supplies the rcu_sysrq_start() and
+ rcu_sysrq_end() API members to be called before and after long
+ sysrq dumps. RCU also supplies the rcu_panic() notifier that is
automatically invoked at the beginning of a panic to suppress further
RCU CPU stall warnings.
@@ -1656,7 +1655,7 @@ against mishaps and misuse:
synchronization mechanism, for example, reference counting.
#. In kernels built with ``CONFIG_RCU_TRACE=y``, RCU-related information
is provided via event tracing.
-#. Open-coded use of ``rcu_assign_pointer()`` and ``rcu_dereference()``
+#. Open-coded use of rcu_assign_pointer() and rcu_dereference()
to create typical linked data structures can be surprisingly
error-prone. Therefore, RCU-protected `linked
lists <https://lwn.net/Articles/609973/#RCU%20List%20APIs>`__ and,
@@ -1665,12 +1664,11 @@ against mishaps and misuse:
other special-purpose RCU-protected data structures are available in
the Linux kernel and the userspace RCU library.
#. Some linked structures are created at compile time, but still require
- ``__rcu`` checking. The ``RCU_POINTER_INITIALIZER()`` macro serves
+ ``__rcu`` checking. The RCU_POINTER_INITIALIZER() macro serves
this purpose.
-#. It is not necessary to use ``rcu_assign_pointer()`` when creating
+#. It is not necessary to use rcu_assign_pointer() when creating
linked structures that are to be published via a single external
- pointer. The ``RCU_INIT_POINTER()`` macro is provided for this task
- and also for assigning ``NULL`` pointers at runtime.
+ pointer. The RCU_INIT_POINTER() macro is provided for this task.
This is not a hard-and-fast list: RCU's diagnostic capabilities will
continue to be guided by the number and type of usage bugs found in
@@ -1716,7 +1714,7 @@ requires almost all of them be hidden behind a ``CONFIG_RCU_EXPERT``
This all should be quite obvious, but the fact remains that Linus
Torvalds recently had to
-`remind <https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com>`__
+`remind <https://lore.kernel.org/r/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com>`__
me of this requirement.
Firmware Interface
@@ -1743,17 +1741,17 @@ Early Boot
~~~~~~~~~~
The Linux kernel's boot sequence is an interesting process, and RCU is
-used early, even before ``rcu_init()`` is invoked. In fact, a number of
+used early, even before rcu_init() is invoked. In fact, a number of
RCU's primitives can be used as soon as the initial task's
``task_struct`` is available and the boot CPU's per-CPU variables are
-set up. The read-side primitives (``rcu_read_lock()``,
-``rcu_read_unlock()``, ``rcu_dereference()``, and
-``rcu_access_pointer()``) will operate normally very early on, as will
-``rcu_assign_pointer()``.
+set up. The read-side primitives (rcu_read_lock(),
+rcu_read_unlock(), rcu_dereference(), and
+rcu_access_pointer()) will operate normally very early on, as will
+rcu_assign_pointer().
-Although ``call_rcu()`` may be invoked at any time during boot,
+Although call_rcu() may be invoked at any time during boot,
callbacks are not guaranteed to be invoked until after all of RCU's
-kthreads have been spawned, which occurs at ``early_initcall()`` time.
+kthreads have been spawned, which occurs at early_initcall() time.
This delay in callback invocation is due to the fact that RCU does not
invoke callbacks until it is fully initialized, and this full
initialization cannot occur until after the scheduler has initialized
@@ -1762,22 +1760,22 @@ it would be possible to invoke callbacks earlier, however, this is not a
panacea because there would be severe restrictions on what operations
those callbacks could invoke.
-Perhaps surprisingly, ``synchronize_rcu()`` and
-``synchronize_rcu_expedited()``, will operate normally during very early
+Perhaps surprisingly, synchronize_rcu() and
+synchronize_rcu_expedited() will operate normally during very early
boot, the reason being that there is only one CPU and preemption is
-disabled. This means that the call ``synchronize_rcu()`` (or friends)
+disabled. This means that the call synchronize_rcu() (or friends)
itself is a quiescent state and thus a grace period, so the early-boot
implementation can be a no-op.
However, once the scheduler has spawned its first kthread, this early
-boot trick fails for ``synchronize_rcu()`` (as well as for
-``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPT=y`` kernels. The
+boot trick fails for synchronize_rcu() (as well as for
+synchronize_rcu_expedited()) in ``CONFIG_PREEMPTION=y`` kernels. The
reason is that an RCU read-side critical section might be preempted,
-which means that a subsequent ``synchronize_rcu()`` really does have to
+which means that a subsequent synchronize_rcu() really does have to
wait for something, as opposed to simply returning immediately.
-Unfortunately, ``synchronize_rcu()`` can't do this until all of its
+Unfortunately, synchronize_rcu() can't do this until all of its
kthreads are spawned, which doesn't happen until some time during
-``early_initcalls()`` time. But this is no excuse: RCU is nevertheless
+early_initcall() time. But this is no excuse: RCU is nevertheless
required to correctly handle synchronous grace periods during this time
period. Once all of its kthreads are up and running, RCU starts running
normally.
@@ -1820,7 +1818,7 @@ Interrupts and NMIs
The Linux kernel has interrupts, and RCU read-side critical sections are
legal within interrupt handlers and within interrupt-disabled regions of
-code, as are invocations of ``call_rcu()``.
+code, as are invocations of call_rcu().
Some Linux-kernel architectures can enter an interrupt handler from
non-idle process context, and then just never leave it, instead
@@ -1832,22 +1830,22 @@ way during a rewrite of RCU's dyntick-idle code.
The Linux kernel has non-maskable interrupts (NMIs), and RCU read-side
critical sections are legal within NMI handlers. Thankfully, RCU
-update-side primitives, including ``call_rcu()``, are prohibited within
+update-side primitives, including call_rcu(), are prohibited within
NMI handlers.
The name notwithstanding, some Linux-kernel architectures can have
nested NMIs, which RCU must handle correctly. Andy Lutomirski `surprised
-me <https://lkml.kernel.org/r/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com>`__
+me <https://lore.kernel.org/r/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com>`__
with this requirement; he also kindly surprised me with `an
-algorithm <https://lkml.kernel.org/r/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com>`__
+algorithm <https://lore.kernel.org/r/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com>`__
that meets this requirement.
Furthermore, NMI handlers can be interrupted by what appear to RCU to be
normal interrupts. One way that this can happen is for code that
-directly invokes ``rcu_irq_enter()`` and ``rcu_irq_exit()`` to be called
+directly invokes rcu_irq_enter() and rcu_irq_exit() to be called
from an NMI handler. This astonishing fact of life prompted the current
-code structure, which has ``rcu_irq_enter()`` invoking
-``rcu_nmi_enter()`` and ``rcu_irq_exit()`` invoking ``rcu_nmi_exit()``.
+code structure, which has rcu_irq_enter() invoking
+rcu_nmi_enter() and rcu_irq_exit() invoking rcu_nmi_exit().
And yes, I also learned of this requirement the hard way.
Loadable Modules
@@ -1857,45 +1855,45 @@ The Linux kernel has loadable modules, and these modules can also be
unloaded. After a given module has been unloaded, any attempt to call
one of its functions results in a segmentation fault. The module-unload
functions must therefore cancel any delayed calls to loadable-module
-functions, for example, any outstanding ``mod_timer()`` must be dealt
-with via ``del_timer_sync()`` or similar.
+functions, for example, any outstanding mod_timer() must be dealt
+with via del_timer_sync() or similar.
Unfortunately, there is no way to cancel an RCU callback; once you
-invoke ``call_rcu()``, the callback function is eventually going to be
+invoke call_rcu(), the callback function is eventually going to be
invoked, unless the system goes down first. Because it is normally
considered socially irresponsible to crash the system in response to a
module unload request, we need some other way to deal with in-flight RCU
callbacks.
-RCU therefore provides ``rcu_barrier()``, which waits until all
+RCU therefore provides rcu_barrier(), which waits until all
in-flight RCU callbacks have been invoked. If a module uses
-``call_rcu()``, its exit function should therefore prevent any future
-invocation of ``call_rcu()``, then invoke ``rcu_barrier()``. In theory,
-the underlying module-unload code could invoke ``rcu_barrier()``
+call_rcu(), its exit function should therefore prevent any future
+invocation of call_rcu(), then invoke rcu_barrier(). In theory,
+the underlying module-unload code could invoke rcu_barrier()
unconditionally, but in practice this would incur unacceptable
latencies.
Nikita Danilov noted this requirement for an analogous
filesystem-unmount situation, and Dipankar Sarma incorporated
-``rcu_barrier()`` into RCU. The need for ``rcu_barrier()`` for module
+rcu_barrier() into RCU. The need for rcu_barrier() for module
unloading became apparent later.
.. important::
- The ``rcu_barrier()`` function is not, repeat,
+ The rcu_barrier() function is not, repeat,
*not*, obligated to wait for a grace period. It is instead only required
to wait for RCU callbacks that have already been posted. Therefore, if
there are no RCU callbacks posted anywhere in the system,
- ``rcu_barrier()`` is within its rights to return immediately. Even if
- there are callbacks posted, ``rcu_barrier()`` does not necessarily need
+ rcu_barrier() is within its rights to return immediately. Even if
+ there are callbacks posted, rcu_barrier() does not necessarily need
to wait for a grace period.
+-----------------------------------------------------------------------+
| **Quick Quiz**: |
+-----------------------------------------------------------------------+
| Wait a minute! Each RCU callback must wait for a grace period to |
-| complete, and ``rcu_barrier()`` must wait for each pre-existing |
-| callback to be invoked. Doesn't ``rcu_barrier()`` therefore need to |
+| complete, and rcu_barrier() must wait for each pre-existing |
+| callback to be invoked. Doesn't rcu_barrier() therefore need to |
| wait for a full grace period if there is even one callback posted |
| anywhere in the system? |
+-----------------------------------------------------------------------+
@@ -1904,14 +1902,14 @@ unloading became apparent later.
| Absolutely not!!! |
| Yes, each RCU callback must wait for a grace period to complete, but |
| it might well be partly (or even completely) finished waiting by the |
-| time ``rcu_barrier()`` is invoked. In that case, ``rcu_barrier()`` |
+| time rcu_barrier() is invoked. In that case, rcu_barrier() |
| need only wait for the remaining portion of the grace period to |
| elapse. So even if there are quite a few callbacks posted, |
-| ``rcu_barrier()`` might well return quite quickly. |
+| rcu_barrier() might well return quite quickly. |
| |
| So if you need to wait for a grace period as well as for all |
| pre-existing callbacks, you will need to invoke both |
-| ``synchronize_rcu()`` and ``rcu_barrier()``. If latency is a concern, |
+| synchronize_rcu() and rcu_barrier(). If latency is a concern, |
| you can always use workqueues to invoke them concurrently. |
+-----------------------------------------------------------------------+
@@ -1929,18 +1927,18 @@ The Linux-kernel CPU-hotplug implementation has notifiers that are used
to allow the various kernel subsystems (including RCU) to respond
appropriately to a given CPU-hotplug operation. Most RCU operations may
be invoked from CPU-hotplug notifiers, including even synchronous
-grace-period operations such as (``synchronize_rcu()`` and
-``synchronize_rcu_expedited()``). However, these synchronous operations
+grace-period operations such as (synchronize_rcu() and
+synchronize_rcu_expedited()). However, these synchronous operations
do block and therefore cannot be invoked from notifiers that execute via
-``stop_machine()``, specifically those between the ``CPUHP_AP_OFFLINE``
+stop_machine(), specifically those between the ``CPUHP_AP_OFFLINE``
and ``CPUHP_AP_ONLINE`` states.
-In addition, all-callback-wait operations such as ``rcu_barrier()`` may
+In addition, all-callback-wait operations such as rcu_barrier() may
not be invoked from any CPU-hotplug notifier. This restriction is due
to the fact that there are phases of CPU-hotplug operations where the
outgoing CPU's callbacks will not be invoked until after the CPU-hotplug
operation ends, which could also result in deadlock. Furthermore,
-``rcu_barrier()`` blocks CPU-hotplug operations during its execution,
+rcu_barrier() blocks CPU-hotplug operations during its execution,
which results in another type of deadlock when invoked from a CPU-hotplug
notifier.
@@ -1955,12 +1953,12 @@ if offline CPUs block an RCU grace period for too long.
An offline CPU's quiescent state will be reported either:
-1. As the CPU goes offline using RCU's hotplug notifier (``rcu_report_dead()``).
-2. When grace period initialization (``rcu_gp_init()``) detects a
+1. As the CPU goes offline using RCU's hotplug notifier (rcu_report_dead()).
+2. When grace period initialization (rcu_gp_init()) detects a
race either with CPU offlining or with a task unblocking on a leaf
``rcu_node`` structure whose CPUs are all offline.
-The CPU-online path (``rcu_cpu_starting()``) should never need to report
+The CPU-online path (rcu_cpu_starting()) should never need to report
a quiescent state for an offline CPU. However, as a debugging measure,
it does emit a warning if a quiescent state was not already reported
for that CPU.
@@ -1984,11 +1982,11 @@ room for further improvement.
There is no longer any prohibition against holding any of
the scheduler's runqueue or priority-inheritance spinlocks across an
-``rcu_read_unlock()``, even if interrupts and preemption were enabled
+rcu_read_unlock(), even if interrupts and preemption were enabled
somewhere within the corresponding RCU read-side critical section.
-Therefore, it is now perfectly legal to execute ``rcu_read_lock()``
+Therefore, it is now perfectly legal to execute rcu_read_lock()
with preemption enabled, acquire one of the scheduler locks, and hold
-that lock across the matching ``rcu_read_unlock()``.
+that lock across the matching rcu_read_unlock().
Similarly, the RCU flavor consolidation has removed the need for negative
nesting. The fact that interrupt-disabled regions of code act as RCU
@@ -1999,7 +1997,7 @@ Tracing and RCU
~~~~~~~~~~~~~~~
It is possible to use tracing on RCU code, but tracing itself uses RCU.
-For this reason, ``rcu_dereference_raw_check()`` is provided for use
+For this reason, rcu_dereference_raw_check() is provided for use
by tracing, which avoids the destructive recursion that could otherwise
ensue. This API is also used by virtualization in some architectures,
where RCU readers execute in environments in which tracing cannot be
@@ -2010,12 +2008,12 @@ Accesses to User Memory and RCU
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The kernel needs to access user-space memory, for example, to access data
-referenced by system-call parameters. The ``get_user()`` macro does this job.
+referenced by system-call parameters. The get_user() macro does this job.
However, user-space memory might well be paged out, which means that
-``get_user()`` might well page-fault and thus block while waiting for the
+get_user() might well page-fault and thus block while waiting for the
resulting I/O to complete. It would be a very bad thing for the compiler to
-reorder a ``get_user()`` invocation into an RCU read-side critical section.
+reorder a get_user() invocation into an RCU read-side critical section.
For example, suppose that the source code looked like this:
@@ -2040,23 +2038,23 @@ the following:
5 rcu_read_unlock();
6 do_something_with(v, user_v);
-If the compiler did make this transformation in a ``CONFIG_PREEMPT=n`` kernel
-build, and if ``get_user()`` did page fault, the result would be a quiescent
+If the compiler did make this transformation in a ``CONFIG_PREEMPTION=n`` kernel
+build, and if get_user() did page fault, the result would be a quiescent
state in the middle of an RCU read-side critical section. This misplaced
quiescent state could result in line 4 being a use-after-free access,
which could be bad for your kernel's actuarial statistics. Similar examples
-can be constructed with the call to ``get_user()`` preceding the
-``rcu_read_lock()``.
+can be constructed with the call to get_user() preceding the
+rcu_read_lock().
-Unfortunately, ``get_user()`` doesn't have any particular ordering properties,
+Unfortunately, get_user() doesn't have any particular ordering properties,
and in some architectures the underlying ``asm`` isn't even marked
``volatile``. And even if it was marked ``volatile``, the above access to
``p->value`` is not volatile, so the compiler would not have any reason to keep
those two accesses in order.
-Therefore, the Linux-kernel definitions of ``rcu_read_lock()`` and
-``rcu_read_unlock()`` must act as compiler barriers, at least for outermost
-instances of ``rcu_read_lock()`` and ``rcu_read_unlock()`` within a nested set
+Therefore, the Linux-kernel definitions of rcu_read_lock() and
+rcu_read_unlock() must act as compiler barriers, at least for outermost
+instances of rcu_read_lock() and rcu_read_unlock() within a nested set
of RCU read-side critical sections.
Energy Efficiency
@@ -2071,26 +2069,26 @@ call.
Because RCU avoids interrupting idle CPUs, it is illegal to execute an
RCU read-side critical section on an idle CPU. (Kernels built with
-``CONFIG_PROVE_RCU=y`` will splat if you try it.) The ``RCU_NONIDLE()``
+``CONFIG_PROVE_RCU=y`` will splat if you try it.) The RCU_NONIDLE()
macro and ``_rcuidle`` event tracing are provided to work around this
-restriction. In addition, ``rcu_is_watching()`` may be used to test
+restriction. In addition, rcu_is_watching() may be used to test
whether or not it is currently legal to run RCU read-side critical
sections on this CPU. I learned of the need for diagnostics on the one
-hand and ``RCU_NONIDLE()`` on the other while inspecting idle-loop code.
+hand and RCU_NONIDLE() on the other while inspecting idle-loop code.
Steven Rostedt supplied ``_rcuidle`` event tracing, which is used quite
heavily in the idle loop. However, there are some restrictions on the
-code placed within ``RCU_NONIDLE()``:
+code placed within RCU_NONIDLE():
#. Blocking is prohibited. In practice, this is not a serious
restriction given that idle tasks are prohibited from blocking to
begin with.
-#. Although nesting ``RCU_NONIDLE()`` is permitted, they cannot nest
+#. Although nesting RCU_NONIDLE() is permitted, they cannot nest
indefinitely deeply. However, given that they can be nested on the
order of a million deep, even on 32-bit systems, this should not be a
serious restriction. This nesting limit would probably be reached
long after the compiler OOMed or the stack overflowed.
-#. Any code path that enters ``RCU_NONIDLE()`` must sequence out of that
- same ``RCU_NONIDLE()``. For example, the following is grossly
+#. Any code path that enters RCU_NONIDLE() must sequence out of that
+ same RCU_NONIDLE(). For example, the following is grossly
illegal:
::
@@ -2103,7 +2101,7 @@ code placed within ``RCU_NONIDLE()``:
It is just as illegal to transfer control into the middle of
- ``RCU_NONIDLE()``'s argument. Yes, in theory, you could transfer in
+ RCU_NONIDLE()'s argument. Yes, in theory, you could transfer in
as long as you also transferred out, but in practice you could also
expect to get sharply worded review comments.
@@ -2195,9 +2193,9 @@ scheduling-clock interrupt be enabled when RCU needs it to be:
sections, and RCU believes this CPU to be idle, no problem. This
sort of thing is used by some architectures for light-weight
exception handlers, which can then avoid the overhead of
- ``rcu_irq_enter()`` and ``rcu_irq_exit()`` at exception entry and
+ rcu_irq_enter() and rcu_irq_exit() at exception entry and
exit, respectively. Some go further and avoid the entireties of
- ``irq_enter()`` and ``irq_exit()``.
+ irq_enter() and irq_exit().
Just make very sure you are running some of your tests with
``CONFIG_PROVE_RCU=y``, just in case one of your code paths was in
fact joking about not doing RCU read-side critical sections.
@@ -2221,7 +2219,7 @@ scheduling-clock interrupt be enabled when RCU needs it to be:
| **Quick Quiz**: |
+-----------------------------------------------------------------------+
| But what if my driver has a hardware interrupt handler that can run |
-| for many seconds? I cannot invoke ``schedule()`` from an hardware |
+| for many seconds? I cannot invoke schedule() from a hardware |
| interrupt handler, after all! |
+-----------------------------------------------------------------------+
| **Answer**: |
@@ -2243,8 +2241,8 @@ Memory Efficiency
Although small-memory non-realtime systems can simply use Tiny RCU, code
size is only one aspect of memory efficiency. Another aspect is the size
-of the ``rcu_head`` structure used by ``call_rcu()`` and
-``kfree_rcu()``. Although this structure contains nothing more than a
+of the ``rcu_head`` structure used by call_rcu() and
+kfree_rcu(). Although this structure contains nothing more than a
pair of pointers, it does appear in many RCU-protected data structures,
including some that are size critical. The ``page`` structure is a case
in point, as evidenced by the many occurrences of the ``union`` keyword
@@ -2254,7 +2252,7 @@ This need for memory efficiency is one reason that RCU uses hand-crafted
singly linked lists to track the ``rcu_head`` structures that are
waiting for a grace period to elapse. It is also the reason why
``rcu_head`` structures do not contain debug information, such as fields
-tracking the file and line of the ``call_rcu()`` or ``kfree_rcu()`` that
+tracking the file and line of the call_rcu() or kfree_rcu() that
posted them. Although this information might appear in debug-only kernel
builds at some point, in the meantime, the ``->func`` field will often
provide the needed debug information.
@@ -2264,18 +2262,18 @@ more extreme measures. Returning to the ``page`` structure, the
``rcu_head`` field shares storage with a great many other structures
that are used at various points in the corresponding page's lifetime. In
order to correctly resolve certain `race
-conditions <https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com>`__,
+conditions <https://lore.kernel.org/r/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com>`__,
the Linux kernel's memory-management subsystem needs a particular bit to
remain zero during all phases of grace-period processing, and that bit
happens to map to the bottom bit of the ``rcu_head`` structure's
-``->next`` field. RCU makes this guarantee as long as ``call_rcu()`` is
-used to post the callback, as opposed to ``kfree_rcu()`` or some future
-“lazy” variant of ``call_rcu()`` that might one day be created for
+``->next`` field. RCU makes this guarantee as long as call_rcu() is
+used to post the callback, as opposed to kfree_rcu() or some future
+“lazy” variant of call_rcu() that might one day be created for
energy-efficiency purposes.
That said, there are limits. RCU requires that the ``rcu_head``
structure be aligned to a two-byte boundary, and passing a misaligned
-``rcu_head`` structure to one of the ``call_rcu()`` family of functions
+``rcu_head`` structure to one of the call_rcu() family of functions
will result in a splat. It is therefore necessary to exercise caution
when packing structures containing fields of type ``rcu_head``. Why not
a four-byte or even eight-byte alignment requirement? Because the m68k
@@ -2299,7 +2297,7 @@ hot code paths in performance-critical portions of the Linux kernel's
networking, security, virtualization, and scheduling code paths. RCU
must therefore use efficient implementations, especially in its
read-side primitives. To that end, it would be good if preemptible RCU's
-implementation of ``rcu_read_lock()`` could be inlined, however, doing
+implementation of rcu_read_lock() could be inlined, however, doing
this requires resolving ``#include`` issues with the ``task_struct``
structure.
@@ -2312,23 +2310,23 @@ on the ``rcu_node`` structure. RCU is required to tolerate all CPUs
continuously invoking any combination of RCU's runtime primitives with
minimal per-operation overhead. In fact, in many cases, increasing load
must *decrease* the per-operation overhead, witness the batching
-optimizations for ``synchronize_rcu()``, ``call_rcu()``,
-``synchronize_rcu_expedited()``, and ``rcu_barrier()``. As a general
+optimizations for synchronize_rcu(), call_rcu(),
+synchronize_rcu_expedited(), and rcu_barrier(). As a general
rule, RCU must cheerfully accept whatever the rest of the Linux kernel
decides to throw at it.
The Linux kernel is used for real-time workloads, especially in
conjunction with the `-rt
-patchset <https://rt.wiki.kernel.org/index.php/Main_Page>`__. The
+patchset <https://wiki.linuxfoundation.org/realtime/>`__. The
real-time-latency response requirements are such that the traditional
approach of disabling preemption across RCU read-side critical sections
-is inappropriate. Kernels built with ``CONFIG_PREEMPT=y`` therefore use
+is inappropriate. Kernels built with ``CONFIG_PREEMPTION=y`` therefore use
an RCU implementation that allows RCU read-side critical sections to be
preempted. This requirement made its presence known after users made it
clear that an earlier `real-time
patch <https://lwn.net/Articles/107930/>`__ did not meet their needs, in
conjunction with some `RCU
-issues <https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com>`__
+issues <https://lore.kernel.org/r/20050318002026.GA2693@us.ibm.com>`__
encountered by a very early version of the -rt patchset.
In addition, RCU must make do with a sub-100-microsecond real-time
@@ -2346,7 +2344,7 @@ number of race conditions.
RCU must avoid degrading real-time response for CPU-bound threads,
whether executing in usermode (which is one use case for
``CONFIG_NO_HZ_FULL=y``) or in the kernel. That said, CPU-bound loops in
-the kernel must execute ``cond_resched()`` at least once per few tens of
+the kernel must execute cond_resched() at least once per few tens of
milliseconds in order to avoid receiving an IPI from RCU.
Finally, RCU's status as a synchronization primitive means that any RCU
@@ -2412,7 +2410,7 @@ grace periods from ever ending. The result was an out-of-memory
condition and a system hang.
The solution was the creation of RCU-bh, which does
-``local_bh_disable()`` across its read-side critical sections, and which
+local_bh_disable() across its read-side critical sections, and which
uses the transition from one type of softirq processing to another as a
quiescent state in addition to context switch, idle, user mode, and
offline. This means that RCU-bh grace periods can complete even when
@@ -2420,31 +2418,31 @@ some of the CPUs execute in softirq indefinitely, thus allowing
algorithms based on RCU-bh to withstand network-based denial-of-service
attacks.
-Because ``rcu_read_lock_bh()`` and ``rcu_read_unlock_bh()`` disable and
+Because rcu_read_lock_bh() and rcu_read_unlock_bh() disable and
re-enable softirq handlers, any attempt to start a softirq handler
during the RCU-bh read-side critical section will be deferred. In this
-case, ``rcu_read_unlock_bh()`` will invoke softirq processing, which can
+case, rcu_read_unlock_bh() will invoke softirq processing, which can
take considerable time. One can of course argue that this softirq
overhead should be associated with the code following the RCU-bh
-read-side critical section rather than ``rcu_read_unlock_bh()``, but the
+read-side critical section rather than rcu_read_unlock_bh(), but the
fact is that most profiling tools cannot be expected to make this sort
of fine distinction. For example, suppose that a three-millisecond-long
RCU-bh read-side critical section executes during a time of heavy
networking load. There will very likely be an attempt to invoke at least
one softirq handler during that three milliseconds, but any such
invocation will be delayed until the time of the
-``rcu_read_unlock_bh()``. This can of course make it appear at first
-glance as if ``rcu_read_unlock_bh()`` was executing very slowly.
+rcu_read_unlock_bh(). This can of course make it appear at first
+glance as if rcu_read_unlock_bh() was executing very slowly.
The `RCU-bh
API <https://lwn.net/Articles/609973/#RCU%20Per-Flavor%20API%20Table>`__
-includes ``rcu_read_lock_bh()``, ``rcu_read_unlock_bh()``,
-``rcu_dereference_bh()``, ``rcu_dereference_bh_check()``,
-``synchronize_rcu_bh()``, ``synchronize_rcu_bh_expedited()``,
-``call_rcu_bh()``, ``rcu_barrier_bh()``, and
-``rcu_read_lock_bh_held()``. However, the update-side APIs are now
-simple wrappers for other RCU flavors, namely RCU-sched in
-CONFIG_PREEMPT=n kernels and RCU-preempt otherwise.
+includes rcu_read_lock_bh(), rcu_read_unlock_bh(), rcu_dereference_bh(),
+rcu_dereference_bh_check(), and rcu_read_lock_bh_held(). However, the
+old RCU-bh update-side APIs are now gone, replaced by synchronize_rcu(),
+synchronize_rcu_expedited(), call_rcu(), and rcu_barrier(). In addition,
+anything that disables bottom halves also marks an RCU-bh read-side
+critical section, including local_bh_disable() and local_bh_enable(),
+local_irq_save() and local_irq_restore(), and so on.
Sched Flavor (Historical)
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2462,32 +2460,32 @@ not have this property, given that any point in the code outside of an
RCU read-side critical section can be a quiescent state. Therefore,
*RCU-sched* was created, which follows “classic” RCU in that an
RCU-sched grace period waits for pre-existing interrupt and NMI
-handlers. In kernels built with ``CONFIG_PREEMPT=n``, the RCU and
+handlers. In kernels built with ``CONFIG_PREEMPTION=n``, the RCU and
RCU-sched APIs have identical implementations, while kernels built with
-``CONFIG_PREEMPT=y`` provide a separate implementation for each.
+``CONFIG_PREEMPTION=y`` provide a separate implementation for each.
-Note well that in ``CONFIG_PREEMPT=y`` kernels,
-``rcu_read_lock_sched()`` and ``rcu_read_unlock_sched()`` disable and
+Note well that in ``CONFIG_PREEMPTION=y`` kernels,
+rcu_read_lock_sched() and rcu_read_unlock_sched() disable and
re-enable preemption, respectively. This means that if there was a
preemption attempt during the RCU-sched read-side critical section,
-``rcu_read_unlock_sched()`` will enter the scheduler, with all the
-latency and overhead entailed. Just as with ``rcu_read_unlock_bh()``,
-this can make it look as if ``rcu_read_unlock_sched()`` was executing
+rcu_read_unlock_sched() will enter the scheduler, with all the
+latency and overhead entailed. Just as with rcu_read_unlock_bh(),
+this can make it look as if rcu_read_unlock_sched() was executing
very slowly. However, the highest-priority task won't be preempted, so
-that task will enjoy low-overhead ``rcu_read_unlock_sched()``
+that task will enjoy low-overhead rcu_read_unlock_sched()
invocations.
The `RCU-sched
API <https://lwn.net/Articles/609973/#RCU%20Per-Flavor%20API%20Table>`__
-includes ``rcu_read_lock_sched()``, ``rcu_read_unlock_sched()``,
-``rcu_read_lock_sched_notrace()``, ``rcu_read_unlock_sched_notrace()``,
-``rcu_dereference_sched()``, ``rcu_dereference_sched_check()``,
-``synchronize_sched()``, ``synchronize_rcu_sched_expedited()``,
-``call_rcu_sched()``, ``rcu_barrier_sched()``, and
-``rcu_read_lock_sched_held()``. However, anything that disables
-preemption also marks an RCU-sched read-side critical section, including
-``preempt_disable()`` and ``preempt_enable()``, ``local_irq_save()`` and
-``local_irq_restore()``, and so on.
+includes rcu_read_lock_sched(), rcu_read_unlock_sched(),
+rcu_read_lock_sched_notrace(), rcu_read_unlock_sched_notrace(),
+rcu_dereference_sched(), rcu_dereference_sched_check(), and
+rcu_read_lock_sched_held(). However, the old RCU-sched update-side APIs
+are now gone, replaced by synchronize_rcu(), synchronize_rcu_expedited(),
+call_rcu(), and rcu_barrier(). In addition, anything that disables
+preemption also marks an RCU-sched read-side critical section,
+including preempt_disable() and preempt_enable(), local_irq_save()
+and local_irq_restore(), and so on.
Sleepable RCU
~~~~~~~~~~~~~
@@ -2509,7 +2507,7 @@ this structure must be passed in to each SRCU function, for example,
structure. The key benefit of these domains is that a slow SRCU reader
in one domain does not delay an SRCU grace period in some other domain.
That said, one consequence of these domains is that read-side code must
-pass a “cookie” from ``srcu_read_lock()`` to ``srcu_read_unlock()``, for
+pass a “cookie” from srcu_read_lock() to srcu_read_unlock(), for
example, as follows:
::
@@ -2539,24 +2537,24 @@ period to elapse. For example, this results in a self-deadlock:
6 srcu_read_unlock(&ss, idx);
However, if line 5 acquired a mutex that was held across a
-``synchronize_srcu()`` for domain ``ss``, deadlock would still be
+synchronize_srcu() for domain ``ss``, deadlock would still be
possible. Furthermore, if line 5 acquired a mutex that was held across a
-``synchronize_srcu()`` for some other domain ``ss1``, and if an
+synchronize_srcu() for some other domain ``ss1``, and if an
``ss1``-domain SRCU read-side critical section acquired another mutex
-that was held across as ``ss``-domain ``synchronize_srcu()``, deadlock
+that was held across an ``ss``-domain synchronize_srcu(), deadlock
would again be possible. Such a deadlock cycle could extend across an
arbitrarily large number of different SRCU domains. Again, with great
power comes great responsibility.
Unlike the other RCU flavors, SRCU read-side critical sections can run
on idle and even offline CPUs. This ability requires that
-``srcu_read_lock()`` and ``srcu_read_unlock()`` contain memory barriers,
+srcu_read_lock() and srcu_read_unlock() contain memory barriers,
which means that SRCU readers will run a bit slower than would RCU
-readers. It also motivates the ``smp_mb__after_srcu_read_unlock()`` API,
-which, in combination with ``srcu_read_unlock()``, guarantees a full
+readers. It also motivates the smp_mb__after_srcu_read_unlock() API,
+which, in combination with srcu_read_unlock(), guarantees a full
memory barrier.
-Also unlike other RCU flavors, ``synchronize_srcu()`` may **not** be
+Also unlike other RCU flavors, synchronize_srcu() may **not** be
invoked from CPU-hotplug notifiers, due to the fact that SRCU grace
periods make use of timers and the possibility of timers being
temporarily “stranded” on the outgoing CPU. This stranding of timers
@@ -2565,7 +2563,7 @@ the CPU-hotplug process. The problem is that if a notifier is waiting on
an SRCU grace period, that grace period is waiting on a timer, and that
timer is stranded on the outgoing CPU, then the notifier will never be
awakened, in other words, deadlock has occurred. This same situation of
-course also prohibits ``srcu_barrier()`` from being invoked from
+course also prohibits srcu_barrier() from being invoked from
CPU-hotplug notifiers.
SRCU also differs from other RCU flavors in that SRCU's expedited and
@@ -2576,12 +2574,12 @@ have not yet completed. (But please note that this is a property of the
current implementation, not necessarily of future implementations.) In
addition, if SRCU has been idle for longer than the interval specified
by the ``srcutree.exp_holdoff`` kernel boot parameter (25 microseconds
-by default), and if a ``synchronize_srcu()`` invocation ends this idle
+by default), and if a synchronize_srcu() invocation ends this idle
period, that invocation will be automatically expedited.
As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating a
locking bottleneck present in prior kernel versions. Although this will
-allow users to put much heavier stress on ``call_srcu()``, it is
+allow users to put much heavier stress on call_srcu(), it is
important to note that SRCU does not yet take any special steps to deal
with callback flooding. So if you are posting (say) 10,000 SRCU
callbacks per second per CPU, you are probably totally OK, but if you
@@ -2592,14 +2590,32 @@ of your CPUs and the size of your memory.
The `SRCU
API <https://lwn.net/Articles/609973/#RCU%20Per-Flavor%20API%20Table>`__
-includes ``srcu_read_lock()``, ``srcu_read_unlock()``,
-``srcu_dereference()``, ``srcu_dereference_check()``,
-``synchronize_srcu()``, ``synchronize_srcu_expedited()``,
-``call_srcu()``, ``srcu_barrier()``, and ``srcu_read_lock_held()``. It
-also includes ``DEFINE_SRCU()``, ``DEFINE_STATIC_SRCU()``, and
-``init_srcu_struct()`` APIs for defining and initializing
+includes srcu_read_lock(), srcu_read_unlock(),
+srcu_dereference(), srcu_dereference_check(),
+synchronize_srcu(), synchronize_srcu_expedited(),
+call_srcu(), srcu_barrier(), and srcu_read_lock_held(). It
+also includes DEFINE_SRCU(), DEFINE_STATIC_SRCU(), and
+init_srcu_struct() APIs for defining and initializing
``srcu_struct`` structures.
+More recently, the SRCU API has added polling interfaces:
+
+#. start_poll_synchronize_srcu() returns a cookie identifying
+ the completion of a future SRCU grace period and ensures
+ that this grace period will be started.
+#. poll_state_synchronize_srcu() returns ``true`` iff the
+ specified cookie corresponds to an already-completed
+ SRCU grace period.
+#. get_state_synchronize_srcu() returns a cookie just like
+ start_poll_synchronize_srcu() does, but differs in that
+ it does nothing to ensure that any future SRCU grace period
+ will be started.
+
+These functions are used to avoid unnecessary SRCU grace periods in
+certain types of buffer-cache algorithms having multi-stage age-out
+mechanisms. The idea is that by the time the block has aged completely
+from the cache, an SRCU grace period will be very likely to have elapsed.
+
Tasks RCU
~~~~~~~~~
@@ -2608,11 +2624,11 @@ required to install different types of probes. It would be good to be
able to free old trampolines, which sounds like a job for some form of
RCU. However, because it is necessary to be able to install a trace
anywhere in the code, it is not possible to use read-side markers such
-as ``rcu_read_lock()`` and ``rcu_read_unlock()``. In addition, it does
+as rcu_read_lock() and rcu_read_unlock(). In addition, it does
not work to have these markers in the trampoline itself, because there
-would need to be instructions following ``rcu_read_unlock()``. Although
-``synchronize_rcu()`` would guarantee that execution reached the
-``rcu_read_unlock()``, it would not be able to guarantee that execution
+would need to be instructions following rcu_read_unlock(). Although
+synchronize_rcu() would guarantee that execution reached the
+rcu_read_unlock(), it would not be able to guarantee that execution
had completely left the trampoline. Worse yet, in some situations
the trampoline's protection must extend a few instructions *prior* to
execution reaching the trampoline. For example, these few instructions
@@ -2623,16 +2639,16 @@ actually reached the trampoline itself.
The solution, in the form of `Tasks
RCU <https://lwn.net/Articles/607117/>`__, is to have implicit read-side
critical sections that are delimited by voluntary context switches, that
-is, calls to ``schedule()``, ``cond_resched()``, and
-``synchronize_rcu_tasks()``. In addition, transitions to and from
+is, calls to schedule(), cond_resched(), and
+synchronize_rcu_tasks(). In addition, transitions to and from
userspace execution also delimit tasks-RCU read-side critical sections.
The tasks-RCU API is quite compact, consisting only of
-``call_rcu_tasks()``, ``synchronize_rcu_tasks()``, and
-``rcu_barrier_tasks()``. In ``CONFIG_PREEMPT=n`` kernels, trampolines
-cannot be preempted, so these APIs map to ``call_rcu()``,
-``synchronize_rcu()``, and ``rcu_barrier()``, respectively. In
-``CONFIG_PREEMPT=y`` kernels, trampolines can be preempted, and these
+call_rcu_tasks(), synchronize_rcu_tasks(), and
+rcu_barrier_tasks(). In ``CONFIG_PREEMPTION=n`` kernels, trampolines
+cannot be preempted, so these APIs map to call_rcu(),
+synchronize_rcu(), and rcu_barrier(), respectively. In
+``CONFIG_PREEMPTION=y`` kernels, trampolines can be preempted, and these
three APIs are therefore implemented by separate functions that check
for voluntary context switches.
@@ -2646,8 +2662,8 @@ grace-period state machine so as to avoid the need for the additional
latency.
RCU disables CPU hotplug in a few places, perhaps most notably in the
-``rcu_barrier()`` operations. If there is a strong reason to use
-``rcu_barrier()`` in CPU-hotplug notifiers, it will be necessary to
+rcu_barrier() operations. If there is a strong reason to use
+rcu_barrier() in CPU-hotplug notifiers, it will be necessary to
avoid disabling CPU hotplug. This would introduce some complexity, so
there had better be a *very* good reason.
@@ -2664,7 +2680,7 @@ However, this combining tree does not spread its memory across NUMA
nodes nor does it align the CPU groups with hardware features such as
sockets or cores. Such spreading and alignment is currently believed to
be unnecessary because the hotpath read-side primitives do not access
-the combining tree, nor does ``call_rcu()`` in the common case. If you
+the combining tree, nor does call_rcu() in the common case. If you
believe that your architecture needs such spreading and alignment, then
your architecture should also benefit from the
``rcutree.rcu_fanout_leaf`` boot parameter, which can be set to the
@@ -2685,7 +2701,7 @@ likely that adjustments will be required to more gracefully handle
extreme loads. It might also be necessary to be able to relate CPU
utilization by RCU's kthreads and softirq handlers to the code that
instigated this CPU utilization. For example, RCU callback overhead
-might be charged back to the originating ``call_rcu()`` instance, though
+might be charged back to the originating call_rcu() instance, though
probably not in production kernels.
Additional work may be required to provide reasonable forward-progress
diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index bb7128eb322ef..1030119294d08 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -70,7 +70,7 @@ over a rather long period of time, but improvements are always welcome!
is less readable and prevents lockdep from detecting locking issues.
Letting RCU-protected pointers "leak" out of an RCU read-side
- critical section is every bid as bad as letting them leak out
+ critical section is every bit as bad as letting them leak out
from under a lock. Unless, of course, you have arranged some
other means of protection, such as a lock or a reference count
-before- letting them out of the RCU read-side critical section.
@@ -129,9 +129,7 @@ over a rather long period of time, but improvements are always welcome!
accesses. The rcu_dereference() primitive ensures that
the CPU picks up the pointer before it picks up the data
that the pointer points to. This really is necessary
- on Alpha CPUs. If you don't believe me, see:
-
- http://www.openvms.compaq.com/wizard/wiz_2637.html
+ on Alpha CPUs.
The rcu_dereference() primitive is also an excellent
documentation aid, letting the person reading the
@@ -214,9 +212,9 @@ over a rather long period of time, but improvements are always welcome!
the rest of the system.
7. As of v4.20, a given kernel implements only one RCU flavor,
- which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y.
+ which is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y.
If the updater uses call_rcu() or synchronize_rcu(),
- then the corresponding readers my use rcu_read_lock() and
+ then the corresponding readers may use rcu_read_lock() and
rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(),
or any pair of primitives that disables and re-enables preemption,
for example, rcu_read_lock_sched() and rcu_read_unlock_sched().
diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst
index f64f4413a47c4..3b4a248774961 100644
--- a/Documentation/RCU/rcubarrier.rst
+++ b/Documentation/RCU/rcubarrier.rst
@@ -9,7 +9,7 @@ RCU (read-copy update) is a synchronization mechanism that can be thought
of as a replacement for reader-writer locking (among other things), but with
very low-overhead readers that are immune to deadlock, priority inversion,
and unbounded latency. RCU read-side critical sections are delimited
-by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT
+by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPTION
kernels, generate no code whatsoever.
This means that RCU writers are unaware of the presence of concurrent
@@ -329,10 +329,10 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last
to smp_call_function() and further to smp_call_function_on_cpu(),
causing this latter to spin until the cross-CPU invocation of
rcu_barrier_func() has completed. This by itself would prevent
- a grace period from completing on non-CONFIG_PREEMPT kernels,
+ a grace period from completing on non-CONFIG_PREEMPTION kernels,
since each CPU must undergo a context switch (or other quiescent
state) before the grace period can complete. However, this is
- of no use in CONFIG_PREEMPT kernels.
+ of no use in CONFIG_PREEMPTION kernels.
Therefore, on_each_cpu() disables preemption across its call
to smp_call_function() and also across the local call to
diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
index c9ab6af4d3be9..7148e9be08c34 100644
--- a/Documentation/RCU/stallwarn.rst
+++ b/Documentation/RCU/stallwarn.rst
@@ -25,7 +25,7 @@ warnings:
- A CPU looping with bottom halves disabled.
-- For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
+- For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the kernel
without invoking schedule(). If the looping in the kernel is
really expected and desirable behavior, you might need to add
some calls to cond_resched().
@@ -44,7 +44,7 @@ warnings:
result in the ``rcu_.*kthread starved for`` console-log message,
which will include additional debugging information.
-- A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+- A CPU-bound real-time task in a CONFIG_PREEMPTION kernel, which might
happen to preempt a low-priority task in the middle of an RCU
read-side critical section. This is especially damaging if
that low-priority task is not permitted to run on any other CPU,
@@ -92,7 +92,9 @@ warnings:
buggy timer hardware through bugs in the interrupt or exception
path (whether hardware, firmware, or software) through bugs
in Linux's timer subsystem through bugs in the scheduler, and,
- yes, even including bugs in RCU itself.
+ yes, even including bugs in RCU itself. It can also result in
+ the ``rcu_.*timer wakeup didn't happen for`` console-log message,
+ which will include additional debugging information.
- A bug in the RCU implementation.
@@ -292,6 +294,25 @@ kthread is waiting for a short timeout, the "state" precedes value of the
task_struct ->state field, and the "cpu" indicates that the grace-period
kthread last ran on CPU 5.
+If the relevant grace-period kthread does not wake from FQS wait in a
+reasonable time, then the following additional line is printed::
+
+ kthread timer wakeup didn't happen for 23804 jiffies! g7076 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402
+
+The "23804" indicates that kthread's timer expired more than 23 thousand
+jiffies ago. The rest of the line has meaning similar to the kthread
+starvation case.
+
+Additionally, the following line is printed::
+
+ Possible timer handling issue on cpu=4 timer-softirq=11142
+
+Here "cpu" indicates that the grace-period kthread last ran on CPU 4,
+where it queued the fqs timer. The number following the "timer-softirq"
+is the current ``TIMER_SOFTIRQ`` count on cpu 4. If this value does not
+change on successive RCU CPU stall warnings, there is further reason to
+suspect a timer problem.
+
Multiple Warnings From One Stall
================================
diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 1a4723f48bd9c..17e95ab2a2014 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -683,7 +683,7 @@ Quick Quiz #1:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This section presents a "toy" RCU implementation that is based on
"classic RCU". It is also short on performance (but only for updates) and
-on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
+on features such as hotplug CPU and the ability to run in CONFIG_PREEMPTION
kernels. The definitions of rcu_dereference() and rcu_assign_pointer()
are the same as those shown in the preceding section, so they are omitted.
::
@@ -739,7 +739,7 @@ Quick Quiz #2:
Quick Quiz #3:
If it is illegal to block in an RCU read-side
critical section, what the heck do you do in
- PREEMPT_RT, where normal spinlocks can block???
+ CONFIG_PREEMPT_RT, where normal spinlocks can block???
:ref:`Answers to Quick Quiz <8_whatisRCU>`
@@ -1093,7 +1093,7 @@ Quick Quiz #2:
overhead is **negative**.
Answer:
- Imagine a single-CPU system with a non-CONFIG_PREEMPT
+ Imagine a single-CPU system with a non-CONFIG_PREEMPTION
kernel where a routing table is used by process-context
code, but can be updated by irq-context code (for example,
by an "ICMP REDIRECT" packet). The usual way of handling
@@ -1120,10 +1120,10 @@ Answer:
Quick Quiz #3:
If it is illegal to block in an RCU read-side
critical section, what the heck do you do in
- PREEMPT_RT, where normal spinlocks can block???
+ CONFIG_PREEMPT_RT, where normal spinlocks can block???
Answer:
- Just as PREEMPT_RT permits preemption of spinlock
+ Just as CONFIG_PREEMPT_RT permits preemption of spinlock
critical sections, it permits preemption of RCU
read-side critical sections. It also permits
spinlocks blocking while in RCU read-side critical
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8cbcd4f2a43be..f3112eff56529 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -808,13 +808,14 @@
insecure, please do not use on production kernels.
debug_locks_verbose=
- [KNL] verbose self-tests
- Format=<0|1>
+ [KNL] verbose locking self-tests
+ Format: <int>
Print debugging info while doing the locking API
self-tests.
- We default to 0 (no extra messages), setting it to
- 1 will print _a lot_ more information - normally
- only useful to kernel developers.
+ Bitmask for the various LOCKTYPE_ tests. Defaults to 0
+ (no extra messages), setting it to -1 (all bits set)
+ will print _a_lot_ more information - normally only
+ useful to lockdep developers.
debug_objects [KNL] Enable object debugging
@@ -950,12 +951,6 @@
causing system reset or hang due to sending
INIT from AP to BSP.
- perf_v4_pmi= [X86,INTEL]
- Format: <bool>
- Disable Intel PMU counter freezing feature.
- The feature only exists starting from
- Arch Perfmon v4 (Skylake and newer).
-
disable_ddw [PPC/PSERIES]
Disable Dynamic DMA Window support. Use this
to workaround buggy firmware.
@@ -3933,6 +3928,13 @@
Format: {"off"}
Disable Hardware Transactional Memory
+ preempt= [KNL]
+ Select preemption mode if you have CONFIG_PREEMPT_DYNAMIC
+ none - Limited to cond_resched() calls
+ voluntary - Limited to cond_resched() and might_sleep() calls
+ full - Any section that isn't explicitly preempt disabled
+ can be preempted anytime.
+
print-fatal-signals=
[KNL] debug: print fatal signals
@@ -4109,6 +4111,10 @@
value, meaning that RCU_SOFTIRQ is used by default.
Specify rcutree.use_softirq=0 to use rcuc kthreads.
+ But note that CONFIG_PREEMPT_RT=y kernels disable
+ this kernel boot parameter, forcibly setting it
+ to zero.
+
rcutree.rcu_fanout_exact= [KNL]
Disable autobalancing of the rcu_node combining
tree. This is used by rcutorture, and might
@@ -4196,12 +4202,6 @@
Set wakeup interval for idle CPUs that have
RCU callbacks (RCU_FAST_NO_HZ=y).
- rcutree.rcu_idle_lazy_gp_delay= [KNL]
- Set wakeup interval for idle CPUs that have
- only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
- Lazy RCU callbacks are those which RCU can
- prove do nothing more than free memory.
-
rcutree.rcu_kick_kthreads= [KNL]
Cause the grace-period kthread to get an extra
wake_up() if it sleeps three times longer than
@@ -4355,6 +4355,14 @@
stress RCU, they don't participate in the actual
test, hence the "fake".
+ rcutorture.nocbs_nthreads= [KNL]
+ Set number of RCU callback-offload togglers.
+ Zero (the default) disables toggling.
+
+ rcutorture.nocbs_toggle= [KNL]
+ Set the delay in milliseconds between successive
+ callback-offload toggling attempts.
+
rcutorture.nreaders= [KNL]
Set number of RCU readers. The value -1 selects
N-1, where N is the number of CPUs. A value
@@ -4487,6 +4495,13 @@
only normal grace-period primitives. No effect
on CONFIG_TINY_RCU kernels.
+ But note that CONFIG_PREEMPT_RT=y kernels enable
+ this kernel boot parameter, forcibly setting
+ it to the value one, that is, converting any
+ post-boot attempt at an expedited RCU grace
+ period to instead use normal non-expedited
+ grace-period processing.
+
rcupdate.rcu_task_ipi_delay= [KNL]
Set time in jiffies during which RCU tasks will
avoid sending IPIs, starting with the beginning
@@ -4574,6 +4589,12 @@
refscale.verbose= [KNL]
Enable additional printk() statements.
+ refscale.verbose_batched= [KNL]
+ Batch the additional printk() statements. If zero
+ (the default) or negative, print everything. Otherwise,
+ print every Nth verbose statement, where N is the value
+ specified.
+
relax_domain_level=
[KNL, SMP] Set scheduler's default relax_domain_level.
See Documentation/admin-guide/cgroup-v1/cpusets.rst.
@@ -5348,6 +5369,14 @@
are running concurrently, especially on systems
with rotating-rust storage.
+ torture.verbose_sleep_frequency= [KNL]
+ Specifies how many verbose printk()s should be
+ emitted between each sleep. The default of zero
+ disables verbose-printk() sleeping.
+
+ torture.verbose_sleep_duration= [KNL]
+ Duration of each verbose-printk() sleep in jiffies.
+
tp720= [HW,PS2]
tpm_suspend_pcr=[HW,TPM]
diff --git a/Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt b/Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt
index ea22dfe485bee..97258f1a1505b 100644
--- a/Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt
+++ b/Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt
@@ -6,8 +6,7 @@ timer counters.
Required properties:
- compatible : "nuvoton,npcm750-timer" for Poleg NPCM750.
- reg : Offset and length of the register set for the device.
-- interrupts : Contain the timer interrupt with flags for
- falling edge.
+- interrupts : Contain the timer interrupt of timer 0.
- clocks : phandle of timer reference clock (usually a 25 MHz clock).
Example:
diff --git a/Documentation/devicetree/bindings/timer/stericsson-u300-apptimer.txt b/Documentation/devicetree/bindings/timer/stericsson-u300-apptimer.txt
deleted file mode 100644
index 9499bc8ee9e33..0000000000000
--- a/Documentation/devicetree/bindings/timer/stericsson-u300-apptimer.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-ST-Ericsson U300 apptimer
-
-Required properties:
-
-- compatible : should be "stericsson,u300-apptimer"
-- reg : Specifies base physical address and size of the registers.
-- interrupts : A list of 4 interrupts; one for each subtimer. These
- are, in order: OS (operating system), DD (device driver) both
- adopted for EPOC/Symbian with two specific IRQs for these tasks,
- then GP1 and GP2, which are general-purpose timers.
-
-Example:
-
-timer {
- compatible = "stericsson,u300-apptimer";
- reg = <0xc0014000 0x1000>;
- interrupts = <24 25 26 27>;
-};
diff --git a/Documentation/scheduler/schedutil.txt b/Documentation/scheduler/schedutil.txt
new file mode 100644
index 0000000000000..78f6b91e22914
--- /dev/null
+++ b/Documentation/scheduler/schedutil.txt
@@ -0,0 +1,169 @@
+
+
+NOTE: all this assumes a linear relation between frequency and work capacity;
+we know this is flawed, but it is the best workable approximation.
+
+
+PELT (Per Entity Load Tracking)
+-------------------------------
+
+With PELT we track some metrics across the various scheduler entities, from
+individual tasks to task-group slices to CPU runqueues. As the basis for this
+we use an Exponentially Weighted Moving Average (EWMA), each period (1024us)
+is decayed such that y^32 = 0.5. That is, the most recent 32ms contribute
+half, while the rest of history contributes the other half.
+
+Specifically:
+
+ ewma_sum(u) := u_0 + u_1*y + u_2*y^2 + ...
+
+ ewma(u) = ewma_sum(u) / ewma_sum(1)
+
+Since this is essentially a progression of an infinite geometric series, the
+results are composable, that is ewma(A) + ewma(B) = ewma(A+B). This property
+is key, since it gives the ability to recompose the averages when tasks move
+around.
+
+Note that blocked tasks still contribute to the aggregates (task-group slices
+and CPU runqueues), which reflects their expected contribution when they
+resume running.
+
+Using this we track 2 key metrics: 'running' and 'runnable'. 'Running'
+reflects the time an entity spends on the CPU, while 'runnable' reflects the
+time an entity spends on the runqueue. When there is only a single task these
+two metrics are the same, but once there is contention for the CPU 'running'
+will decrease to reflect the fraction of time each task spends on the CPU
+while 'runnable' will increase to reflect the amount of contention.
+
+For more detail see: kernel/sched/pelt.c
+
+
+Frequency- / CPU Invariance
+---------------------------
+
+Because consuming the CPU for 50% at 1GHz is not the same as consuming the CPU
+for 50% at 2GHz, nor is running 50% on a LITTLE CPU the same as running 50% on
+a big CPU, we allow architectures to scale the time delta with two ratios, one
+Dynamic Voltage and Frequency Scaling (DVFS) ratio and one microarch ratio.
+
+For simple DVFS architectures (where software is in full control) we trivially
+compute the ratio as:
+
+ f_cur
+ r_dvfs := -----
+ f_max
+
+For more dynamic systems where the hardware is in control of DVFS we use
+hardware counters (Intel APERF/MPERF, ARMv8.4-AMU) to provide us this ratio.
+For Intel specifically, we use:
+
+ APERF
+ f_cur := ----- * P0
+ MPERF
+
+ 4C-turbo; if available and turbo enabled
+ f_max := { 1C-turbo; if turbo enabled
+ P0; otherwise
+
+ f_cur
+ r_dvfs := min( 1, ----- )
+ f_max
+
+We pick 4C turbo over 1C turbo to make it slightly more sustainable.
+
+r_cpu is determined as the ratio of highest performance level of the current
+CPU vs the highest performance level of any other CPU in the system.
+
+ r_tot = r_dvfs * r_cpu
+
+The result is that the above 'running' and 'runnable' metrics become invariant
+of DVFS and CPU type. IOW, we can transfer and compare them between CPUs.
+
+For more detail see:
+
+ - kernel/sched/pelt.h:update_rq_clock_pelt()
+ - arch/x86/kernel/smpboot.c:"APERF/MPERF frequency ratio computation."
+ - Documentation/scheduler/sched-capacity.rst:"1. CPU Capacity + 2. Task utilization"
+
+
+UTIL_EST / UTIL_EST_FASTUP
+--------------------------
+
+Because periodic tasks have their averages decayed while they sleep, even
+though when running their expected utilization will be the same, they suffer a
+(DVFS) ramp-up after they are running again.
+
+To alleviate this (a default enabled option) UTIL_EST drives an Infinite
+Impulse Response (IIR) EWMA with the 'running' value on dequeue -- when it is
+highest. A further default enabled option UTIL_EST_FASTUP modifies the IIR
+filter to instantly increase and only decay on decrease.
+
+A further runqueue wide sum (of runnable tasks) is maintained of:
+
+ util_est := \Sum_t max( t_running, t_util_est_ewma )
+
+For more detail see: kernel/sched/fair.c:util_est_dequeue()
+
+
+UCLAMP
+------
+
+It is possible to set effective u_min and u_max clamps on each CFS or RT task;
+the runqueue keeps a max aggregate of these clamps for all running tasks.
+
+For more detail see: include/uapi/linux/sched/types.h
+
+
+Schedutil / DVFS
+----------------
+
+Every time the scheduler load tracking is updated (task wakeup, task
+migration, time progression) we call out to schedutil to update the hardware
+DVFS state.
+
+The basis is the CPU runqueue's 'running' metric, which per the above is
+the frequency invariant utilization estimate of the CPU. From this we compute
+a desired frequency like:
+
+ max( running, util_est ); if UTIL_EST
+ u_cfs := { running; otherwise
+
+ clamp( u_cfs + u_rt , u_min, u_max ); if UCLAMP_TASK
+ u_clamp := { u_cfs + u_rt; otherwise
+
+ u := u_clamp + u_irq + u_dl; [approx. see source for more detail]
+
+ f_des := min( f_max, 1.25 u * f_max )
+
+XXX IO-wait; when the update is due to a task wakeup from IO-completion we
+boost 'u' above.
+
+This frequency is then used to select a P-state/OPP or directly munged into a
+CPPC style request to the hardware.
+
+XXX: deadline tasks (Sporadic Task Model) allow us to calculate a hard f_min
+required to satisfy the workload.
+
+Because these callbacks are directly from the scheduler, the DVFS hardware
+interaction should be 'fast' and non-blocking. Schedutil supports
+rate-limiting DVFS requests for when hardware interaction is slow and
+expensive; this reduces effectiveness.
+
+For more information see: kernel/sched/cpufreq_schedutil.c
+
+
+NOTES
+-----
+
+ - On low-load scenarios, where DVFS is most relevant, the 'running' numbers
+ will closely reflect utilization.
+
+ - In saturated scenarios task movement will cause some transient dips,
+ suppose we have a CPU saturated with 4 tasks, then when we migrate a task
+ to an idle CPU, the old CPU will have a 'running' value of 0.75 while the
+ new CPU will gain 0.25. This is inevitable and time progression will
+ correct this. XXX do we still guarantee f_max due to no idle-time?
+
+ - Much of the above is about avoiding DVFS dips, and independent DVFS domains
+ having to re-learn / ramp-up when load shifts.
+
diff --git a/MAINTAINERS b/MAINTAINERS
index 2f45db9a59fb5..2ed6d8a94f8b9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9232,10 +9232,11 @@ F: include/linux/tboot.h
INTEL SGX
M: Jarkko Sakkinen <jarkko@kernel.org>
+R: Dave Hansen <dave.hansen@linux.intel.com>
L: linux-sgx@vger.kernel.org
S: Supported
Q: https://patchwork.kernel.org/project/intel-sgx/list/
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-sgx.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/sgx
F: Documentation/x86/sgx.rst
F: arch/x86/entry/vdso/vsgx.S
F: arch/x86/include/uapi/asm/sgx.h
@@ -10380,6 +10381,8 @@ LOCKING PRIMITIVES
M: Peter Zijlstra <peterz@infradead.org>
M: Ingo Molnar <mingo@redhat.com>
M: Will Deacon <will@kernel.org>
+R: Waiman Long <longman@redhat.com>
+R: Boqun Feng <boqun.feng@gmail.com> (LOCKDEP)
L: linux-kernel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
diff --git a/arch/Kconfig b/arch/Kconfig
index eea2bc20d028b..2969e7c9cfa7d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -763,6 +763,12 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
This spares a stack switch and improves cache usage on softirq
processing.
+config HAVE_SOFTIRQ_ON_OWN_STACK
+ bool
+ help
+ Architecture provides a function to run __do_softirq() on a
+ separate stack.
+
config PGTABLE_LEVELS
int
default 2
@@ -1094,6 +1100,15 @@ config HAVE_STATIC_CALL_INLINE
bool
depends on HAVE_STATIC_CALL
+config HAVE_PREEMPT_DYNAMIC
+ bool
+ depends on HAVE_STATIC_CALL
+ depends on GENERIC_ENTRY
+ help
+ Select this if the architecture supports boot time preempt setting
+ on top of static calls. It is strongly advised to support inline
+ static call to avoid any overhead.
+
config ARCH_WANT_LD_ORPHAN_WARN
bool
help
diff --git a/arch/ia64/include/asm/efi.h b/arch/ia64/include/asm/efi.h
new file mode 100644
index 0000000000000..6a4a50d8f19a5
--- /dev/null
+++ b/arch/ia64/include/asm/efi.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_EFI_H
+#define _ASM_EFI_H
+
+typedef int (*efi_freemem_callback_t) (u64 start, u64 end, void *arg);
+
+void *efi_get_pal_addr(void);
+void efi_map_pal_code(void);
+void efi_memmap_walk(efi_freemem_callback_t, void *);
+void efi_memmap_walk_uc(efi_freemem_callback_t, void *);
+void efi_gettimeofday(struct timespec64 *ts);
+
+#endif
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index 8d9da6f08a62e..a15fe0809aaed 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -23,7 +23,7 @@
* unmapping a portion of the virtual address space, these hooks are called according to
* the following template:
*
- * tlb <- tlb_gather_mmu(mm, start, end); // start unmap for address space MM
+ * tlb <- tlb_gather_mmu(mm); // start unmap for address space MM
* {
* for each vma that needs a shootdown do {
* tlb_start_vma(tlb, vma);
@@ -36,7 +36,7 @@
* tlb_end_vma(tlb, vma);
* }
* }
- * tlb_finish_mmu(tlb, start, end); // finish unmap for address space MM
+ * tlb_finish_mmu(tlb); // finish unmap for address space MM
*/
#include <linux/mm.h>
#include <linux/pagemap.h>
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index b6bb718ed1ff7..c5fe21de46a81 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -34,6 +34,7 @@
#include <linux/kexec.h>
#include <linux/mm.h>
+#include <asm/efi.h>
#include <asm/io.h>
#include <asm/kregs.h>
#include <asm/meminit.h>
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index efc9b568401c8..af310dc8a356b 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -16,6 +16,7 @@
#include <linux/numa.h>
#include <linux/mmzone.h>
+#include <asm/efi.h>
#include <asm/numa.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 3911c561d2bbf..d4cae2fc69ca3 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -91,6 +91,7 @@
#include <linux/gfp.h>
#include <asm/delay.h>
+#include <asm/efi.h>
#include <asm/meminit.h>
#include <asm/page.h>
#include <asm/ptrace.h>
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 093040f7e626a..49b4885809399 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -45,6 +45,7 @@
#include <asm/cache.h>
#include <asm/current.h>
#include <asm/delay.h>
+#include <asm/efi.h>
#include <asm/io.h>
#include <asm/irq.h>
#include <asm/mca.h>
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 43e8050145bef..fa9c0ab8c6fc9 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -26,6 +26,7 @@
#include <linux/sched/cputime.h>
#include <asm/delay.h>
+#include <asm/efi.h>
#include <asm/hw_irq.h>
#include <asm/ptrace.h>
#include <asm/sal.h>
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index 0750f367837d2..51883a66aeb58 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -20,14 +20,12 @@
#include <linux/genalloc.h>
#include <linux/gfp.h>
#include <linux/pgtable.h>
+#include <asm/efi.h>
#include <asm/page.h>
#include <asm/pal.h>
#include <linux/atomic.h>
#include <asm/tlbflush.h>
-
-extern void __init efi_memmap_walk_uc(efi_freemem_callback_t, void *);
-
struct uncached_pool {
struct gen_pool *pool;
struct mutex add_chunk_mutex; /* serialize adding a converted chunk */
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index bfc4ecd0a2ab6..62fe80a16f426 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -21,6 +21,7 @@
#include <linux/swap.h>
#include <linux/sizes.h>
+#include <asm/efi.h>
#include <asm/meminit.h>
#include <asm/sections.h>
#include <asm/mca.h>
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c7311131156e8..03b3a02375ff3 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -24,6 +24,7 @@
#include <linux/efi.h>
#include <linux/nodemask.h>
#include <linux/slab.h>
+#include <asm/efi.h>
#include <asm/tlb.h>
#include <asm/meminit.h>
#include <asm/numa.h>
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index e76386a3479ea..b19f47a5a3051 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -27,6 +27,7 @@
#include <linux/swiotlb.h>
#include <asm/dma.h>
+#include <asm/efi.h>
#include <asm/io.h>
#include <asm/numa.h>
#include <asm/patch.h>
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 9d1037fe5da36..7bb6affd7179c 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -64,6 +64,7 @@ config PARISC
select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE
select HAVE_KPROBES_ON_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
+ select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS
select SET_FS
help
diff --git a/arch/parisc/include/asm/hardirq.h b/arch/parisc/include/asm/hardirq.h
index fad29aa6f45fa..1e4fbd0fd9448 100644
--- a/arch/parisc/include/asm/hardirq.h
+++ b/arch/parisc/include/asm/hardirq.h
@@ -12,10 +12,6 @@
#include <linux/threads.h>
#include <linux/irq.h>
-#ifdef CONFIG_IRQSTACKS
-#define __ARCH_HAS_DO_SOFTIRQ
-#endif
-
typedef struct {
unsigned int __softirq_pending;
unsigned int kernel_stack_usage;
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 1dfb439b06928..0d46b19dc4d3d 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -17,6 +17,7 @@
#include <linux/types.h>
#include <asm/io.h>
+#include <asm/softirq_stack.h>
#include <asm/smp.h>
#include <asm/ldcw.h>
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 038ef3408830e..dc29b7e1f5aaf 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -236,6 +236,7 @@ config PPC
select MMU_GATHER_PAGE_SIZE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
+ select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_IRQ_TIME_ACCOUNTING
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index 4f983ca4030a4..f3f264e441a79 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -37,8 +37,6 @@ extern int distribute_irqs;
struct pt_regs;
-#define __ARCH_HAS_DO_SOFTIRQ
-
#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
/*
* Per-cpu stacks for handling critical, debug and machine check
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 086b0a7433290..d71fd10a1dd46 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -66,6 +66,7 @@
#include <asm/livepatch.h>
#include <asm/asm-prototypes.h>
#include <asm/hw_irq.h>
+#include <asm/softirq_stack.h>
#ifdef CONFIG_PPC64
#include <asm/paca.h>
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 4b4319d84c54d..6817331e22ffc 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2208,7 +2208,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
ppmu->get_mem_weight)
- ppmu->get_mem_weight(&data.weight);
+ ppmu->get_mem_weight(&data.weight.full);
if (perf_event_overflow(event, &data, regs))
power_pmu_stop(event, 0);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index f18d5067cd0fa..aeb7f3922106a 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -72,7 +72,7 @@ static struct timer_list spuloadavg_timer;
#define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK))
#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
+ max(x * (MAX_PRIO - prio) / (NICE_WIDTH / 2), MIN_SPU_TIMESLICE)
/*
* scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 030e391e053ca..e70510ecd34e8 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -185,6 +185,7 @@ config S390
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
select HAVE_RSEQ
+ select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_VIRT_CPU_ACCOUNTING_IDLE
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h
index dfbc3c6c06746..58668ffb54882 100644
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -18,7 +18,6 @@
#define or_softirq_pending(x) (S390_lowcore.softirq_pending |= (x))
#define __ARCH_IRQ_STAT
-#define __ARCH_HAS_DO_SOFTIRQ
#define __ARCH_IRQ_EXIT_IRQS_DISABLED
static inline void ack_bad_irq(unsigned int irq)
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index c6d40bcf4a680..601c217913384 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -28,6 +28,7 @@
#include <asm/irq.h>
#include <asm/hw_irq.h>
#include <asm/stacktrace.h>
+#include <asm/softirq_stack.h>
#include "entry.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 52646f52f130f..bccd0da08128f 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -55,6 +55,7 @@ config SUPERH
select HAVE_PERF_EVENTS
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_UID16
+ select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS
select HAVE_STACKPROTECTOR
select HAVE_SYSCALL_TRACEPOINTS
select IRQ_FORCED_THREADING
diff --git a/arch/sh/include/asm/irq.h b/arch/sh/include/asm/irq.h
index 6d44c32ef0475..839551ce398c5 100644
--- a/arch/sh/include/asm/irq.h
+++ b/arch/sh/include/asm/irq.h
@@ -51,7 +51,6 @@ asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs);
#ifdef CONFIG_IRQSTACKS
extern void irq_ctx_init(int cpu);
extern void irq_ctx_exit(int cpu);
-# define __ARCH_HAS_DO_SOFTIRQ
#else
# define irq_ctx_init(cpu) do { } while (0)
# define irq_ctx_exit(cpu) do { } while (0)
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index ab5f790b0cd27..ef0f0827cf575 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -20,6 +20,7 @@
#include <linux/uaccess.h>
#include <asm/thread_info.h>
#include <cpu/mmu_context.h>
+#include <asm/softirq_stack.h>
atomic_t irq_err_count;
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e23260e184e49..daaf756af64e4 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -97,6 +97,7 @@ config SPARC64
select ARCH_HAS_PTE_SPECIAL
select PCI_DOMAINS if PCI
select ARCH_HAS_GIGANTIC_PAGE
+ select HAVE_SOFTIRQ_ON_OWN_STACK
config ARCH_PROC_KCORE_TEXT
def_bool y
diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h
index 4d748e93b9742..154df2cf19f42 100644
--- a/arch/sparc/include/asm/irq_64.h
+++ b/arch/sparc/include/asm/irq_64.h
@@ -93,7 +93,6 @@ void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
extern void *hardirq_stack[NR_CPUS];
extern void *softirq_stack[NR_CPUS];
-#define __ARCH_HAS_DO_SOFTIRQ
#define NO_IRQ 0xffffffff
diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h
index e841cae544c2b..779a5a0f06080 100644
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -24,7 +24,6 @@ void flush_tlb_pending(void);
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#define tlb_flush(tlb) flush_tlb_pending()
/*
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 3ec9f1402aad3..c8848bb681a11 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -42,6 +42,7 @@
#include <asm/head.h>
#include <asm/hypervisor.h>
#include <asm/cacheflush.h>
+#include <asm/softirq_stack.h>
#include "entry.h"
#include "cpumap.h"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ae73b8ae01a51..aa14a81f231f5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -188,6 +188,7 @@ config X86
select HAVE_HW_BREAKPOINT
select HAVE_IDE
select HAVE_IOREMAP_PROT
+ select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
@@ -221,10 +222,12 @@ config X86
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
select HAVE_FUNCTION_ARG_ACCESS_API
+ select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
select HAVE_STACK_VALIDATION if X86_64
select HAVE_STATIC_CALL
select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
+ select HAVE_PREEMPT_DYNAMIC
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
@@ -891,7 +894,7 @@ config HPET_TIMER
config HPET_EMULATE_RTC
def_bool y
- depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
+ depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
# Mark as expert because too many people got it wrong.
# The code disables itself when not needed.
@@ -1147,10 +1150,6 @@ config X86_MCE_INJECT
If you don't know what a machine check is and you don't do kernel
QA it is safe to say n.
-config X86_THERMAL_VECTOR
- def_bool y
- depends on X86_MCE_INTEL
-
source "arch/x86/events/Kconfig"
config X86_LEGACY_VM86
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5857917f83eee..b9f58b8993b31 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -50,6 +50,9 @@ export BITS
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+# Intel CET isn't enabled in the kernel
+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
+
ifeq ($(CONFIG_X86_32),y)
BITS := 32
UTS_MACHINE := i386
@@ -120,9 +123,6 @@ else
KBUILD_CFLAGS += -mno-red-zone
KBUILD_CFLAGS += -mcmodel=kernel
-
- # Intel CET isn't enabled in the kernel
- KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
endif
ifdef CONFIG_X86_X32
@@ -295,16 +295,20 @@ archclean:
$(Q)$(MAKE) $(clean)=arch/x86/tools
define archhelp
- echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
- echo ' install - Install kernel using'
- echo ' (your) ~/bin/$(INSTALLKERNEL) or'
- echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
- echo ' install to $$(INSTALL_PATH) and run lilo'
- echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
- echo ' bzdisk/fdimage*/isoimage also accept:'
- echo ' FDARGS="..." arguments for the booted kernel'
- echo ' FDINITRD=file initrd for the booted kernel'
+ echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
+ echo ' install - Install kernel using (your) ~/bin/$(INSTALLKERNEL) or'
+ echo ' (distribution) /sbin/$(INSTALLKERNEL) or install to '
+ echo ' $$(INSTALL_PATH) and run lilo'
+ echo ''
+ echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
+ echo ' bzdisk/fdimage*/isoimage also accept:'
+ echo ' FDARGS="..." arguments for the booted kernel'
+ echo ' FDINITRD=file initrd for the booted kernel'
+ echo ''
+ echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest'
+ echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest'
+
endef
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 0904f5676e4d8..a2433ae8a65e7 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -249,30 +249,23 @@ static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif
-static void __xen_pv_evtchn_do_upcall(void)
+static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
- irq_enter_rcu();
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
inc_irq_stat(irq_hv_callback_count);
xen_hvm_evtchn_do_upcall();
- irq_exit_rcu();
+ set_irq_regs(old_regs);
}
__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
- struct pt_regs *old_regs;
+ irqentry_state_t state = irqentry_enter(regs);
bool inhcall;
- irqentry_state_t state;
- state = irqentry_enter(regs);
- old_regs = set_irq_regs(regs);
-
- instrumentation_begin();
- run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
- instrumentation_begin();
-
- set_irq_regs(old_regs);
+ run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
inhcall = get_and_clear_inhcall();
if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cad08703c4ad7..400908dff42eb 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -46,14 +46,6 @@
.code64
.section .entry.text, "ax"
-#ifdef CONFIG_PARAVIRT_XXL
-SYM_CODE_START(native_usergs_sysret64)
- UNWIND_HINT_EMPTY
- swapgs
- sysretq
-SYM_CODE_END(native_usergs_sysret64)
-#endif /* CONFIG_PARAVIRT_XXL */
-
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
@@ -123,7 +115,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
+ * In the Xen PV case we must use iret anyway.
*/
+
+ ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
+ X86_FEATURE_XENPV
+
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
@@ -215,7 +212,8 @@ syscall_return_via_sysret:
popq %rdi
popq %rsp
- USERGS_SYSRET64
+ swapgs
+ sysretq
SYM_CODE_END(entry_SYSCALL_64)
/*
@@ -669,7 +667,7 @@ native_irq_return_ldt:
*/
pushq %rdi /* Stash user RDI */
- SWAPGS /* to kernel GS */
+ swapgs /* to kernel GS */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
movq PER_CPU_VAR(espfix_waddr), %rdi
@@ -699,7 +697,7 @@ native_irq_return_ldt:
orq PER_CPU_VAR(espfix_stack), %rax
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
- SWAPGS /* to user GS */
+ swapgs /* to user GS */
popq %rdi /* Restore user RDI */
movq %rax, %rsp
@@ -756,47 +754,6 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
SYM_CODE_END(.Lbad_gs)
.previous
-/*
- * rdi: New stack pointer points to the top word of the stack
- * rsi: Function pointer
- * rdx: Function argument (can be NULL if none)
- */
-SYM_FUNC_START(asm_call_on_stack)
-SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL)
-SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL)
- /*
- * Save the frame pointer unconditionally. This allows the ORC
- * unwinder to handle the stack switch.
- */
- pushq %rbp
- mov %rsp, %rbp
-
- /*
- * The unwinder relies on the word at the top of the new stack
- * page linking back to the previous RSP.
- */
- mov %rsp, (%rdi)
- mov %rdi, %rsp
- /* Move the argument to the right place */
- mov %rdx, %rdi
-
-1:
- .pushsection .discard.instr_begin
- .long 1b - .
- .popsection
-
- CALL_NOSPEC rsi
-
-2:
- .pushsection .discard.instr_end
- .long 2b - .
- .popsection
-
- /* Restore the previous stack pointer from RBP. */
- leaveq
- ret
-SYM_FUNC_END(asm_call_on_stack)
-
#ifdef CONFIG_XEN_PV
/*
* A note on the "critical region" in our callback handler.
@@ -943,7 +900,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
ret
.Lparanoid_entry_swapgs:
- SWAPGS
+ swapgs
/*
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
@@ -1001,7 +958,7 @@ SYM_CODE_START_LOCAL(paranoid_exit)
jnz restore_regs_and_return_to_kernel
/* We are returning to a context with user GSBASE */
- SWAPGS_UNSAFE_STACK
+ swapgs
jmp restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
@@ -1426,7 +1383,7 @@ nmi_no_fsgsbase:
jnz nmi_restore
nmi_swapgs:
- SWAPGS_UNSAFE_STACK
+ swapgs
nmi_restore:
POP_REGS
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index e37de298a4955..6ddeed3cd2acb 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -81,6 +81,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
+DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
+
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -253,6 +255,8 @@ static bool check_hw_exists(void)
if (ret)
goto msr_fail;
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+ if (fixed_counter_disabled(i))
+ continue;
if (val & (0x03 << i*4)) {
bios_fail = 1;
val_fail = val;
@@ -665,6 +669,12 @@ void x86_pmu_disable_all(void)
}
}
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+ return static_call(x86_pmu_guest_get_msrs)(nr);
+}
+EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+
/*
* There may be PMI landing after enabled=0. The PMI hitting could be before or
* after disable_all.
@@ -1523,6 +1533,8 @@ void perf_event_print_debug(void)
cpu, idx, prev_left);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx))
+ continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1923,6 +1935,8 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
+
+ static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
}
static void _x86_pmu_read(struct perf_event *event)
@@ -1930,6 +1944,13 @@ static void _x86_pmu_read(struct perf_event *event)
x86_perf_event_update(event);
}
+static inline struct perf_guest_switch_msr *
+perf_guest_get_msrs_nop(int *nr)
+{
+ *nr = 0;
+ return NULL;
+}
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1995,12 +2016,17 @@ static int __init init_hw_perf_events(void)
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... fixed-purpose events: %lu\n",
+ hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1)
+ << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl));
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
+ if (!x86_pmu.guest_get_msrs)
+ x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop;
+
x86_pmu_static_call_update();
/*
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index d4569bfa83e30..5bac48d5c18e8 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -275,6 +275,55 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+/*
+ * Extra-register table for Sapphire Rapids; intel_pmu_init() installs it
+ * as x86_pmu.extra_regs for INTEL_FAM6_SAPPHIRERAPIDS_X.
+ *
+ * It ties events 0x012a and 0x012b to MSR_OFFCORE_RSP_0 and
+ * MSR_OFFCORE_RSP_1, registers 0x01cd as the PEBS load-latency event and
+ * ties 0x01c6 to MSR_PEBS_FRONTEND.
+ */
+static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ EVENT_EXTRA_END
+};
+
+/*
+ * Counter constraints for Sapphire Rapids; intel_pmu_init() installs this
+ * table as x86_pmu.event_constraints for INTEL_FAM6_SAPPHIRERAPIDS_X.
+ *
+ * The table lists the fixed-counter events first, then the eight topdown
+ * metric events, then the general-purpose counter masks.
+ */
+static struct event_constraint intel_spr_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7),
+
+ INTEL_EVENT_CONSTRAINT(0x2e, 0xff),
+ INTEL_EVENT_CONSTRAINT(0x3c, 0xff),
+ /*
+ * Generally, event codes < 0x90 are restricted to counters 0-3.
+ * Events 0x2E and 0x3C are the exceptions: they have no restriction.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf),
+
+ INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1),
+ INTEL_EVENT_CONSTRAINT(0xce, 0x1),
+ INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf),
+ /*
+ * Generally, event codes >= 0x90 are likely to have no restrictions.
+ * The exceptions are the entries listed above.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff),
+
+ EVENT_CONSTRAINT_END
+};
+
+
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
@@ -314,11 +363,15 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles,
EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
"4", "2");
-EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4");
-EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80");
-EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81");
-EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82");
-EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83");
+EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4");
+EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80");
+EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81");
+EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82");
+EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83");
+EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84");
+EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85");
+EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86");
+EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87");
static struct attribute *snb_events_attrs[] = {
EVENT_PTR(td_slots_issued),
@@ -384,6 +437,108 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
+/*
+ * Sapphire Rapids encodings for the generic hardware cache events,
+ * indexed by [cache][operation][result]. intel_pmu_init() copies this
+ * table into hw_cache_event_ids for INTEL_FAM6_SAPPHIRERAPIDS_X.
+ *
+ * The LL and NODE slots all use event 0x12a, which intel_spr_extra_regs
+ * ties to MSR_OFFCORE_RSP_0; spr_hw_cache_extra_regs carries entries for
+ * the same LL and NODE slots.
+ *
+ * NOTE(review): slots not listed here are zero-initialised. How the
+ * consumer of hw_cache_event_ids treats 0 versus -1 is not visible in
+ * this hunk - confirm against that code.
+ */
+static __initconst const u64 spr_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0,
+ [ C(RESULT_MISS) ] = 0xe124,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_MISS) ] = 0xe424,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0,
+ [ C(RESULT_MISS) ] = 0xe12,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0,
+ [ C(RESULT_MISS) ] = 0xe13,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = 0xe11,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4c4,
+ [ C(RESULT_MISS) ] = 0x4c5,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ },
+};
+
+/*
+ * Companion to spr_hw_cache_event_ids, with the same
+ * [cache][operation][result] indexing. intel_pmu_init() copies this table
+ * into hw_cache_extra_regs for INTEL_FAM6_SAPPHIRERAPIDS_X.
+ *
+ * Only the LL and NODE slots are populated, i.e. the slots whose event id
+ * in spr_hw_cache_event_ids is 0x12a.
+ * NOTE(review): presumably these are the MSR_OFFCORE_RSP_0 values for
+ * those events - confirm against the code that consumes
+ * hw_cache_extra_regs.
+ */
+static __initconst const u64 spr_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10001,
+ [ C(RESULT_MISS) ] = 0x3fbfc00001,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x3f3ffc0002,
+ [ C(RESULT_MISS) ] = 0x3f3fc00002,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10c000001,
+ [ C(RESULT_MISS) ] = 0x3fb3000001,
+ },
+ },
+};
+
/*
* Notes on the events:
* - data reads do not include code reads (comparable to earlier tables)
@@ -2134,18 +2289,6 @@ static void intel_tfa_pmu_enable_all(int added)
intel_pmu_enable_all(added);
}
-static void enable_counter_freeze(void)
-{
- update_debugctlmsr(get_debugctlmsr() |
- DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
-}
-
-static void disable_counter_freeze(void)
-{
- update_debugctlmsr(get_debugctlmsr() &
- ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
-}
-
static inline u64 intel_pmu_get_status(void)
{
u64 status;
@@ -2337,8 +2480,8 @@ static void __icl_update_topdown_event(struct perf_event *event,
}
}
-static void update_saved_topdown_regs(struct perf_event *event,
- u64 slots, u64 metrics)
+static void update_saved_topdown_regs(struct perf_event *event, u64 slots,
+ u64 metrics, int metric_end)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2347,7 +2490,7 @@ static void update_saved_topdown_regs(struct perf_event *event,
event->hw.saved_slots = slots;
event->hw.saved_metric = metrics;
- for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) {
+ for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
continue;
other = cpuc->events[idx];
@@ -2362,7 +2505,8 @@ static void update_saved_topdown_regs(struct perf_event *event,
* The PERF_METRICS and Fixed counter 3 are read separately. The values may be
* modify by a NMI. PMU has to be disabled before calling this function.
*/
-static u64 icl_update_topdown_event(struct perf_event *event)
+
+static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2378,7 +2522,7 @@ static u64 icl_update_topdown_event(struct perf_event *event)
/* read PERF_METRICS */
rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
- for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) {
+ for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
continue;
other = cpuc->events[idx];
@@ -2404,7 +2548,7 @@ static u64 icl_update_topdown_event(struct perf_event *event)
* Don't need to reset the PERF_METRICS and Fixed counter 3.
* Because the values will be restored in next schedule in.
*/
- update_saved_topdown_regs(event, slots, metrics);
+ update_saved_topdown_regs(event, slots, metrics, metric_end);
reset = false;
}
@@ -2413,12 +2557,18 @@ static u64 icl_update_topdown_event(struct perf_event *event)
wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0);
wrmsrl(MSR_PERF_METRICS, 0);
if (event)
- update_saved_topdown_regs(event, 0, 0);
+ update_saved_topdown_regs(event, 0, 0, metric_end);
}
return slots;
}
+/*
+ * x86_pmu.update_topdown_event callback shared by the Ice Lake and
+ * Sapphire Rapids setups in intel_pmu_init().
+ *
+ * Calls intel_update_topdown_event() with the index of the last metric
+ * counter, computed as INTEL_PMC_IDX_METRIC_BASE +
+ * x86_pmu.num_topdown_events - 1, and returns its result.
+ */
+static u64 icl_update_topdown_event(struct perf_event *event)
+{
+ return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE +
+ x86_pmu.num_topdown_events - 1);
+}
+
static void intel_pmu_read_topdown_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2573,8 +2723,11 @@ static void intel_pmu_reset(void)
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
}
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx))
+ continue;
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ }
if (ds)
ds->bts_index = ds->bts_buffer_base;
@@ -2709,95 +2862,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
return handled;
}
-static bool disable_counter_freezing = true;
-static int __init intel_perf_counter_freezing_setup(char *s)
-{
- bool res;
-
- if (kstrtobool(s, &res))
- return -EINVAL;
-
- disable_counter_freezing = !res;
- return 1;
-}
-__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup);
-
-/*
- * Simplified handler for Arch Perfmon v4:
- * - We rely on counter freezing/unfreezing to enable/disable the PMU.
- * This is done automatically on PMU ack.
- * - Ack the PMU only after the APIC.
- */
-
-static int intel_pmu_handle_irq_v4(struct pt_regs *regs)
-{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- int handled = 0;
- bool bts = false;
- u64 status;
- int pmu_enabled = cpuc->enabled;
- int loops = 0;
-
- /* PMU has been disabled because of counter freezing */
- cpuc->enabled = 0;
- if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
- bts = true;
- intel_bts_disable_local();
- handled = intel_pmu_drain_bts_buffer();
- handled += intel_bts_interrupt();
- }
- status = intel_pmu_get_status();
- if (!status)
- goto done;
-again:
- intel_pmu_lbr_read();
- if (++loops > 100) {
- static bool warned;
-
- if (!warned) {
- WARN(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
- warned = true;
- }
- intel_pmu_reset();
- goto done;
- }
-
-
- handled += handle_pmi_common(regs, status);
-done:
- /* Ack the PMI in the APIC */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- /*
- * The counters start counting immediately while ack the status.
- * Make it as close as possible to IRET. This avoids bogus
- * freezing on Skylake CPUs.
- */
- if (status) {
- intel_pmu_ack_status(status);
- } else {
- /*
- * CPU may issues two PMIs very close to each other.
- * When the PMI handler services the first one, the
- * GLOBAL_STATUS is already updated to reflect both.
- * When it IRETs, the second PMI is immediately
- * handled and it sees clear status. At the meantime,
- * there may be a third PMI, because the freezing bit
- * isn't set since the ack in first PMI handlers.
- * Double check if there is more work to be done.
- */
- status = intel_pmu_get_status();
- if (status)
- goto again;
- }
-
- if (bts)
- intel_bts_enable_local();
- cpuc->enabled = pmu_enabled;
- return handled;
-}
-
/*
* This handler is triggered by the local APIC, so the APIC IRQ handling
* rules apply:
@@ -3563,6 +3627,26 @@ static int core_pmu_hw_config(struct perf_event *event)
return intel_pmu_bts_config(event);
}
+/*
+ * Config value of the last topdown metric event this PMU provides:
+ * INTEL_TD_METRIC_RETIRING advanced by (x86_pmu.num_topdown_events - 1)
+ * steps of 1 << 8.
+ */
+#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \
+ ((x86_pmu.num_topdown_events - 1) << 8))
+
+/*
+ * True when @event is a metric event (is_metric_event()) and its
+ * attr.config does not exceed INTEL_TD_METRIC_AVAILABLE_MAX.
+ */
+static bool is_available_metric_event(struct perf_event *event)
+{
+ return is_metric_event(event) &&
+ event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX;
+}
+
+/*
+ * True when @event's attr.config, masked with INTEL_ARCH_EVENT_MASK,
+ * equals event 0xcd / umask 0x01 (the load latency event).
+ */
+static inline bool is_mem_loads_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01);
+}
+
+/*
+ * True when @event's attr.config, masked with INTEL_ARCH_EVENT_MASK,
+ * equals event 0x03 / umask 0x82 (exported as "mem-loads-aux").
+ */
+static inline bool is_mem_loads_aux_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82);
+}
+
+
static int intel_pmu_hw_config(struct perf_event *event)
{
int ret = x86_pmu_hw_config(event);
@@ -3636,7 +3720,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.config & X86_ALL_EVENT_FLAGS)
return -EINVAL;
- if (is_metric_event(event)) {
+ if (is_available_metric_event(event)) {
struct perf_event *leader = event->group_leader;
/* The metric events don't support sampling. */
@@ -3665,6 +3749,33 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
}
+ /*
+ * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR
+ * doesn't function quite right. As a work-around it needs to always be
+ * co-scheduled with an auxiliary event X86_CONFIG(.event=0x03, .umask=0x82).
+ * The actual count of this second event is irrelevant; it just needs
+ * to be active to make the first event function correctly.
+ *
+ * In a group, the auxiliary event must be in front of the load latency
+ * event. The rule is to simplify the implementation of the check.
+ * That's because perf cannot have a complete group at the moment.
+ */
+ if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX &&
+ (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
+ is_mem_loads_event(event)) {
+ struct perf_event *leader = event->group_leader;
+ struct perf_event *sibling = NULL;
+
+ if (!is_mem_loads_aux_event(leader)) {
+ for_each_sibling_event(sibling, leader) {
+ if (is_mem_loads_aux_event(sibling))
+ break;
+ }
+ if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list))
+ return -ENODATA;
+ }
+ }
+
if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
return 0;
@@ -3680,26 +3791,6 @@ static int intel_pmu_hw_config(struct perf_event *event)
return 0;
}
-#ifdef CONFIG_RETPOLINE
-static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr);
-static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr);
-#endif
-
-struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
-#ifdef CONFIG_RETPOLINE
- if (x86_pmu.guest_get_msrs == intel_guest_get_msrs)
- return intel_guest_get_msrs(nr);
- else if (x86_pmu.guest_get_msrs == core_guest_get_msrs)
- return core_guest_get_msrs(nr);
-#endif
- if (x86_pmu.guest_get_msrs)
- return x86_pmu.guest_get_msrs(nr);
- *nr = 0;
- return NULL;
-}
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
-
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -3865,6 +3956,29 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
}
+/*
+ * x86_pmu.get_event_constraints callback for Sapphire Rapids.
+ *
+ * Starts from the constraint icl_get_event_constraints() returns. For
+ * events with attr.precise_ip == 3 the result is narrowed to
+ * counter0_constraint when that constraint allows counter 0, and to
+ * emptyconstraint otherwise. All other events get the Ice Lake
+ * constraint unchanged.
+ */
static struct event_constraint *
+spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = icl_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * The :ppp modifier selects the Precise Distribution (PDist) facility,
+ * which is only supported on GP counter 0. If a :ppp event is not
+ * available on GP counter 0, error out.
+ */
+ if (event->attr.precise_ip == 3) {
+ if (c->idxmsk64 & BIT_ULL(0))
+ return &counter0_constraint;
+
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
{
@@ -3953,6 +4067,14 @@ static u64 nhm_limit_period(struct perf_event *event, u64 left)
return max(left, 32ULL);
}
+/*
+ * x86_pmu.limit_period callback for Sapphire Rapids: events with
+ * attr.precise_ip == 3 get a period of at least 128; every other event
+ * keeps @left as is.
+ */
+static u64 spr_limit_period(struct perf_event *event, u64 left)
+{
+ if (event->attr.precise_ip == 3)
+ return max(left, 128ULL);
+
+ return left;
+}
+
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -4094,9 +4216,6 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.version > 1)
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
- if (x86_pmu.counter_freezing)
- enable_counter_freeze();
-
/* Disable perf metrics if any added CPU doesn't support it. */
if (x86_pmu.intel_cap.perf_metrics) {
union perf_capabilities perf_cap;
@@ -4167,9 +4286,6 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc)
static void intel_pmu_cpu_dying(int cpu)
{
fini_debug_store_on_cpu(cpu);
-
- if (x86_pmu.counter_freezing)
- disable_counter_freeze();
}
void intel_cpuc_finish(struct cpu_hw_events *cpuc)
@@ -4397,6 +4513,9 @@ static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e),
@@ -4561,39 +4680,6 @@ static __init void intel_nehalem_quirk(void)
}
}
-static const struct x86_cpu_desc counter_freezing_ucodes[] = {
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006),
- {}
-};
-
-static bool intel_counter_freezing_broken(void)
-{
- return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes);
-}
-
-static __init void intel_counter_freezing_quirk(void)
-{
- /* Check if it's already disabled */
- if (disable_counter_freezing)
- return;
-
- /*
- * If the system starts with the wrong ucode, leave the
- * counter-freezing feature permanently disabled.
- */
- if (intel_counter_freezing_broken()) {
- pr_info("PMU counter freezing disabled due to CPU errata,"
- "please upgrade microcode\n");
- x86_pmu.counter_freezing = false;
- x86_pmu.handle_irq = intel_pmu_handle_irq;
- }
-}
-
/*
* enable software workaround for errata:
* SNB: BJ122
@@ -4703,6 +4789,42 @@ static struct attribute *icl_tsx_events_attrs[] = {
NULL,
};
+
+/* Sapphire Rapids memory event strings. */
+EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2");
+EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82");
+
+/*
+ * Memory event attributes for Sapphire Rapids; intel_pmu_init() assigns
+ * this list to mem_attr for INTEL_FAM6_SAPPHIRERAPIDS_X.
+ */
+static struct attribute *spr_events_attrs[] = {
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_spr),
+ EVENT_PTR(mem_ld_aux),
+ NULL,
+};
+
+/*
+ * Topdown event attributes for Sapphire Rapids: slots plus the eight
+ * topdown metric events. intel_pmu_init() assigns this list to td_attr
+ * for INTEL_FAM6_SAPPHIRERAPIDS_X.
+ */
+static struct attribute *spr_td_events_attrs[] = {
+ EVENT_PTR(slots),
+ EVENT_PTR(td_retiring),
+ EVENT_PTR(td_bad_spec),
+ EVENT_PTR(td_fe_bound),
+ EVENT_PTR(td_be_bound),
+ EVENT_PTR(td_heavy_ops),
+ EVENT_PTR(td_br_mispredict),
+ EVENT_PTR(td_fetch_lat),
+ EVENT_PTR(td_mem_bound),
+ NULL,
+};
+
+/*
+ * TSX event attributes for Sapphire Rapids; intel_pmu_init() assigns this
+ * list to tsx_attr for INTEL_FAM6_SAPPHIRERAPIDS_X.
+ */
+static struct attribute *spr_tsx_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_capacity_read),
+ EVENT_PTR(tx_capacity_write),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
+ NULL,
+};
+
static ssize_t freeze_on_smi_show(struct device *cdev,
struct device_attribute *attr,
char *buf)
@@ -4926,7 +5048,7 @@ __init int intel_pmu_init(void)
union cpuid10_eax eax;
union cpuid10_ebx ebx;
struct event_constraint *c;
- unsigned int unused;
+ unsigned int fixed_mask;
struct extra_reg *er;
bool pmem = false;
int version, i;
@@ -4948,7 +5070,7 @@ __init int intel_pmu_init(void)
* Check whether the Architectural PerfMon supports
* Branch Misses Retired hw_event or not.
*/
- cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+ cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full);
if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
return -ENODEV;
@@ -4972,15 +5094,15 @@ __init int intel_pmu_init(void)
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events, when not running in a hypervisor:
*/
- if (version > 1) {
+ if (version > 1 && version < 5) {
int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
x86_pmu.num_counters_fixed =
max((int)edx.split.num_counters_fixed, assume);
- }
- if (version >= 4)
- x86_pmu.counter_freezing = !disable_counter_freezing;
+ fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1;
+ } else if (version >= 5)
+ x86_pmu.num_counters_fixed = fls(fixed_mask);
if (boot_cpu_has(X86_FEATURE_PDCM)) {
u64 capabilities;
@@ -5109,7 +5231,6 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_ATOM_GOLDMONT_D:
- x86_add_quirk(intel_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
@@ -5136,7 +5257,6 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
- x86_add_quirk(intel_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
@@ -5483,12 +5603,50 @@ __init int intel_pmu_init(void)
x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
x86_pmu.lbr_pt_coexist = true;
intel_pmu_pebs_data_source_skl(pmem);
+ x86_pmu.num_topdown_events = 4;
x86_pmu.update_topdown_event = icl_update_topdown_event;
x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
pr_cont("Icelake events, ");
name = "icelake";
break;
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ pmem = true;
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ x86_pmu.event_constraints = intel_spr_event_constraints;
+ x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_spr_extra_regs;
+ x86_pmu.limit_period = spr_limit_period;
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = spr_get_event_constraints;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_skl_attr = skl_format_attr;
+ mem_attr = spr_events_attrs;
+ td_attr = spr_td_events_attrs;
+ tsx_attr = spr_tsx_events_attrs;
+ x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
+ x86_pmu.lbr_pt_coexist = true;
+ intel_pmu_pebs_data_source_skl(pmem);
+ x86_pmu.num_topdown_events = 8;
+ x86_pmu.update_topdown_event = icl_update_topdown_event;
+ x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
+ pr_cont("Sapphire Rapids events, ");
+ name = "sapphire_rapids";
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -5531,8 +5689,7 @@ __init int intel_pmu_init(void)
x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
}
- x86_pmu.intel_ctrl |=
- ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+ x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED;
/* AnyThread may be deprecated on arch perfmon v5 or later */
if (x86_pmu.intel_cap.anythread_deprecated)
@@ -5549,13 +5706,22 @@ __init int intel_pmu_init(void)
* events to the generic counters.
*/
if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
+ /*
+ * Disable topdown slots and metrics events,
+ * if slots event is not in CPUID.
+ */
+ if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl))
+ c->idxmsk64 = 0;
c->weight = hweight64(c->idxmsk64);
continue;
}
- if (c->cmask == FIXED_EVENT_FLAGS
- && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+ if (c->cmask == FIXED_EVENT_FLAGS) {
+ /* Disable fixed counters which are not in CPUID */
+ c->idxmsk64 &= x86_pmu.intel_ctrl;
+
+ if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
}
c->idxmsk64 &=
~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
@@ -5601,13 +5767,6 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, ");
}
- /*
- * For arch perfmon 4 use counter freezing to avoid
- * several MSR accesses in the PMI.
- */
- if (x86_pmu.counter_freezing)
- x86_pmu.handle_irq = intel_pmu_handle_irq_v4;
-
if (x86_pmu.intel_cap.perf_metrics)
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 67dbc91bccfee..7ebae18264033 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -36,7 +36,9 @@ union intel_x86_pebs_dse {
unsigned int ld_dse:4;
unsigned int ld_stlb_miss:1;
unsigned int ld_locked:1;
- unsigned int ld_reserved:26;
+ unsigned int ld_data_blk:1;
+ unsigned int ld_addr_blk:1;
+ unsigned int ld_reserved:24;
};
struct {
unsigned int st_l1d_hit:1;
@@ -45,6 +47,12 @@ union intel_x86_pebs_dse {
unsigned int st_locked:1;
unsigned int st_reserved2:26;
};
+ struct {
+ unsigned int st_lat_dse:4;
+ unsigned int st_lat_stlb_miss:1;
+ unsigned int st_lat_locked:1;
+ unsigned int ld_reserved3:26;
+ };
};
@@ -198,6 +206,63 @@ static u64 load_latency_data(u64 status)
if (dse.ld_locked)
val |= P(LOCK, LOCKED);
+ /*
+ * Ice Lake and earlier models do not support block information.
+ */
+ if (!x86_pmu.pebs_block) {
+ val |= P(BLK, NA);
+ return val;
+ }
+ /*
+ * bit 6: load was blocked since its data could not be forwarded
+ * from a preceding store
+ */
+ if (dse.ld_data_blk)
+ val |= P(BLK, DATA);
+
+ /*
+ * bit 7: load was blocked due to potential address conflict with
+ * a preceding store
+ */
+ if (dse.ld_addr_blk)
+ val |= P(BLK, ADDR);
+
+ if (!dse.ld_data_blk && !dse.ld_addr_blk)
+ val |= P(BLK, NA);
+
+ return val;
+}
+
+/*
+ * Build the data-source value for a store-latency PEBS sample.
+ * get_data_src() calls this for events flagged PERF_X86_EVENT_PEBS_STLAT.
+ *
+ * @status is decoded through union intel_x86_pebs_dse: bits 0-3 index
+ * pebs_data_source[], bit 4 selects the TLB hit/miss flags and bit 5 adds
+ * the LOCKED flag. The block field is always reported as not available.
+ */
+static u64 store_latency_data(u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val;
+
+ dse.val = status;
+
+ /*
+ * use the mapping table for bit 0-3
+ */
+ val = pebs_data_source[dse.st_lat_dse];
+
+ /*
+ * bit 4: TLB access
+ * 0 = did not miss 2nd level TLB
+ * 1 = missed 2nd level TLB
+ */
+ if (dse.st_lat_stlb_miss)
+ val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ /*
+ * bit 5: locked prefix
+ */
+ if (dse.st_lat_locked)
+ val |= P(LOCK, LOCKED);
+
+ /* no block information is reported for stores */
+ val |= P(BLK, NA);
+
return val;
}
@@ -870,6 +935,28 @@ struct event_constraint intel_icl_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+/*
+ * PEBS counter constraints for Sapphire Rapids; intel_pmu_init() installs
+ * this table as x86_pmu.pebs_constraints for
+ * INTEL_FAM6_SAPPHIRERAPIDS_X, together with PMU_FL_PEBS_ALL.
+ */
+struct event_constraint intel_spr_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xfe),
+ INTEL_PSD_CONSTRAINT(0x2cd, 0x1),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),
+
+ /*
+ * Everything else is handled by PMU_FL_PEBS_ALL, because we
+ * need the full constraints from the main table.
+ */
+
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *c;
@@ -960,7 +1047,8 @@ static void adaptive_pebs_record_size_update(void)
}
#define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \
- PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \
+ PERF_SAMPLE_PHYS_ADDR | \
+ PERF_SAMPLE_WEIGHT_TYPE | \
PERF_SAMPLE_TRANSACTION | \
PERF_SAMPLE_DATA_PAGE_SIZE)
@@ -987,7 +1075,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
gprs = (sample_type & PERF_SAMPLE_REGS_INTR) &&
(attr->sample_regs_intr & PEBS_GP_REGS);
- tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) &&
+ tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
((attr->config & INTEL_ARCH_EVENT_MASK) ==
x86_pmu.rtm_abort_event);
@@ -1331,6 +1419,8 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
if (fl & PERF_X86_EVENT_PEBS_LDLAT)
val = load_latency_data(aux);
+ else if (fl & PERF_X86_EVENT_PEBS_STLAT)
+ val = store_latency_data(aux);
else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
val = precise_datala_hsw(event, aux);
else if (fst)
@@ -1369,8 +1459,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
/*
* Use latency for weight (only avail with PEBS-LL)
*/
- if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
- data->weight = pebs->lat;
+ if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE))
+ data->weight.full = pebs->lat;
/*
* data.data_src encodes the data source
@@ -1462,8 +1552,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
- if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
- data->weight = intel_get_tsx_weight(pebs->tsx_tuning);
+ if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll)
+ data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning);
if (sample_type & PERF_SAMPLE_TRANSACTION)
data->txn = intel_get_tsx_transaction(pebs->tsx_tuning,
@@ -1507,6 +1597,9 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
#endif
}
+/*
+ * Layout of meminfo->latency as decoded in
+ * setup_pebs_adaptive_sample_data(): when PMU_FL_INSTR_LATENCY is set,
+ * the low 16 bits are stored in data->weight.var2_w and the value is
+ * then shifted right by 32 bits before the remaining weight is taken
+ * from it.
+ */
+#define PEBS_LATENCY_MASK 0xffff
+#define PEBS_CACHE_LATENCY_OFFSET 32
+
/*
* With adaptive PEBS the layout depends on what fields are configured.
*/
@@ -1577,9 +1670,27 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
if (format_size & PEBS_DATACFG_MEMINFO) {
- if (sample_type & PERF_SAMPLE_WEIGHT)
- data->weight = meminfo->latency ?:
- intel_get_tsx_weight(meminfo->tsx_tuning);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+ u64 weight = meminfo->latency;
+
+ if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) {
+ data->weight.var2_w = weight & PEBS_LATENCY_MASK;
+ weight >>= PEBS_CACHE_LATENCY_OFFSET;
+ }
+
+ /*
+ * Although meminfo::latency is defined as a u64,
+ * only the lower 32 bits include the valid data
+ * in practice on Ice Lake and earlier platforms.
+ */
+ if (sample_type & PERF_SAMPLE_WEIGHT) {
+ data->weight.full = weight ?:
+ intel_get_tsx_weight(meminfo->tsx_tuning);
+ } else {
+ data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
+ intel_get_tsx_weight(meminfo->tsx_tuning);
+ }
+ }
if (sample_type & PERF_SAMPLE_DATA_SRC)
data->data_src.val = get_data_src(event, meminfo->aux);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 357258f82dc85..33c8180d5a874 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -31,21 +31,21 @@ struct event_constraint uncore_constraint_empty =
MODULE_LICENSE("GPL");
-int uncore_pcibus_to_physid(struct pci_bus *bus)
+int uncore_pcibus_to_dieid(struct pci_bus *bus)
{
struct pci2phy_map *map;
- int phys_id = -1;
+ int die_id = -1;
raw_spin_lock(&pci2phy_map_lock);
list_for_each_entry(map, &pci2phy_map_head, list) {
if (map->segment == pci_domain_nr(bus)) {
- phys_id = map->pbus_to_physid[bus->number];
+ die_id = map->pbus_to_dieid[bus->number];
break;
}
}
raw_spin_unlock(&pci2phy_map_lock);
- return phys_id;
+ return die_id;
}
static void uncore_free_pcibus_map(void)
@@ -86,7 +86,7 @@ lookup:
alloc = NULL;
map->segment = segment;
for (i = 0; i < 256; i++)
- map->pbus_to_physid[i] = -1;
+ map->pbus_to_dieid[i] = -1;
list_add_tail(&map->list, &pci2phy_map_head);
end:
@@ -332,7 +332,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
uncore_pmu_init_hrtimer(box);
box->cpu = -1;
- box->pci_phys_id = -1;
box->dieid = -1;
/* set default hrtimer timeout */
@@ -993,18 +992,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid)
/*
* Get the die information of a PCI device.
* @pdev: The PCI device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static int uncore_pci_get_dev_die_info(struct pci_dev *pdev,
- int *phys_id, int *die)
+static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
{
- *phys_id = uncore_pcibus_to_physid(pdev->bus);
- if (*phys_id < 0)
- return -ENODEV;
-
- *die = (topology_max_die_per_package() > 1) ? *phys_id :
- topology_phys_to_logical_pkg(*phys_id);
+ *die = uncore_pcibus_to_dieid(pdev->bus);
if (*die < 0)
return -EINVAL;
@@ -1046,13 +1038,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
* @pdev: The PCI device.
* @type: The corresponding PMU type of the device.
* @pmu: The corresponding PMU of the device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
static int uncore_pci_pmu_register(struct pci_dev *pdev,
struct intel_uncore_type *type,
struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+ int die)
{
struct intel_uncore_box *box;
int ret;
@@ -1070,7 +1061,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
WARN_ON_ONCE(pmu->func_id != pdev->devfn);
atomic_inc(&box->refcnt);
- box->pci_phys_id = phys_id;
box->dieid = die;
box->pci_dev = pdev;
box->pmu = pmu;
@@ -1097,9 +1087,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
{
struct intel_uncore_type *type;
struct intel_uncore_pmu *pmu = NULL;
- int phys_id, die, ret;
+ int die, ret;
- ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die);
+ ret = uncore_pci_get_dev_die_info(pdev, &die);
if (ret)
return ret;
@@ -1132,7 +1122,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
}
- ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die);
+ ret = uncore_pci_pmu_register(pdev, type, pmu, die);
pci_set_drvdata(pdev, pmu->boxes[die]);
@@ -1142,17 +1132,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
/*
* Unregister the PMU of a PCI device
* @pmu: The corresponding PMU is unregistered.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die)
{
struct intel_uncore_box *box = pmu->boxes[die];
- if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
- return;
-
pmu->boxes[die] = NULL;
if (atomic_dec_return(&pmu->activeboxes) == 0)
uncore_pmu_unregister(pmu);
@@ -1164,9 +1149,9 @@ static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box;
struct intel_uncore_pmu *pmu;
- int i, phys_id, die;
+ int i, die;
- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return;
box = pci_get_drvdata(pdev);
@@ -1185,7 +1170,7 @@ static void uncore_pci_remove(struct pci_dev *pdev)
pci_set_drvdata(pdev, NULL);
- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);
}
static int uncore_bus_notify(struct notifier_block *nb,
@@ -1194,7 +1179,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
struct device *dev = data;
struct pci_dev *pdev = to_pci_dev(dev);
struct intel_uncore_pmu *pmu;
- int phys_id, die;
+ int die;
/* Unregister the PMU when the device is going to be deleted. */
if (action != BUS_NOTIFY_DEL_DEVICE)
@@ -1204,10 +1189,10 @@ static int uncore_bus_notify(struct notifier_block *nb,
if (!pmu)
return NOTIFY_DONE;
- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return NOTIFY_DONE;
- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);
return NOTIFY_OK;
}
@@ -1224,7 +1209,7 @@ static void uncore_pci_sub_driver_init(void)
struct pci_dev *pci_sub_dev;
bool notify = false;
unsigned int devfn;
- int phys_id, die;
+ int die;
while (ids && ids->vendor) {
pci_sub_dev = NULL;
@@ -1244,12 +1229,11 @@ static void uncore_pci_sub_driver_init(void)
if (!pmu)
continue;
- if (uncore_pci_get_dev_die_info(pci_sub_dev,
- &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
continue;
if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu,
- phys_id, die))
+ die))
notify = true;
}
ids++;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 9efea154349d3..a3c6e1643ad23 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -124,7 +124,6 @@ struct intel_uncore_extra_reg {
};
struct intel_uncore_box {
- int pci_phys_id;
int dieid; /* Logical die ID */
int n_active; /* number of active events */
int n_events;
@@ -173,11 +172,11 @@ struct freerunning_counters {
struct pci2phy_map {
struct list_head list;
int segment;
- int pbus_to_physid[256];
+ int pbus_to_dieid[256];
};
struct pci2phy_map *__find_pci2phy_map(int segment);
-int uncore_pcibus_to_physid(struct pci_bus *bus);
+int uncore_pcibus_to_dieid(struct pci_bus *bus);
ssize_t uncore_event_show(struct device *dev,
struct device_attribute *attr, char *buf);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 098f893e2e225..51271288499e7 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -657,7 +657,7 @@ int snb_pci2phy_map_init(int devid)
pci_dev_put(dev);
return -ENOMEM;
}
- map->pbus_to_physid[bus] = 0;
+ map->pbus_to_dieid[bus] = 0;
raw_spin_unlock(&pci2phy_map_lock);
pci_dev_put(dev);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 7bdb1821215db..b79951d0707c2 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1359,7 +1359,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
{
struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid, segment;
+ int i, bus, nodeid, segment, die_id;
struct pci2phy_map *map;
int err = 0;
u32 config = 0;
@@ -1370,36 +1370,77 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
if (!ubox_dev)
break;
bus = ubox_dev->bus->number;
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
- if (err)
- break;
- nodeid = config & NODE_ID_MASK;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
- if (err)
- break;
+ /*
+ * The nodeid and idmap registers only contain enough
+ * information to handle 8 nodes. On systems with more
+ * than 8 nodes, we need to rely on NUMA information,
+ * filled in from BIOS supplied information, to determine
+ * the topology.
+ */
+ if (nr_node_ids <= 8) {
+ /* get the Node ID of the local register */
+ err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
+ if (err)
+ break;
+ nodeid = config & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+ if (err)
+ break;
- segment = pci_domain_nr(ubox_dev->bus);
- raw_spin_lock(&pci2phy_map_lock);
- map = __find_pci2phy_map(segment);
- if (!map) {
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ if (topology_max_die_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ map->pbus_to_dieid[bus] = die_id;
+ break;
+ }
+ }
raw_spin_unlock(&pci2phy_map_lock);
- err = -ENOMEM;
- break;
- }
+ } else {
+ int node = pcibus_to_node(ubox_dev->bus);
+ int cpu;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
- map->pbus_to_physid[bus] = i;
+ die_id = -1;
+ for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node) {
+ map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ if (WARN_ON_ONCE(die_id == -1)) {
+ err = -EINVAL;
break;
}
}
- raw_spin_unlock(&pci2phy_map_lock);
}
if (!err) {
@@ -1412,17 +1453,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
i = -1;
if (reverse) {
for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
} else {
for (bus = 0; bus <= 255; bus++) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
}
}
@@ -4646,19 +4687,14 @@ int snr_uncore_pci_init(void)
static struct pci_dev *snr_uncore_get_mc_dev(int id)
{
struct pci_dev *mc_dev = NULL;
- int phys_id, pkg;
+ int pkg;
while (1) {
mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev);
if (!mc_dev)
break;
- phys_id = uncore_pcibus_to_physid(mc_dev->bus);
- if (phys_id < 0)
- continue;
- pkg = topology_phys_to_logical_pkg(phys_id);
- if (pkg < 0)
- continue;
- else if (pkg == id)
+ pkg = uncore_pcibus_to_dieid(mc_dev->bus);
+ if (pkg == id)
break;
}
return mc_dev;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 7895cf4c59a7d..53b2b5fc23bca 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -80,6 +80,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */
#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */
#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */
+#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */
static inline bool is_topdown_count(struct perf_event *event)
{
@@ -443,6 +444,10 @@ struct cpu_hw_events {
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
+#define INTEL_PSD_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_STLAT)
+
#define INTEL_PST_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
@@ -682,8 +687,7 @@ struct x86_pmu {
/* PMI handler bits */
unsigned int late_ack :1,
- enabled_ack :1,
- counter_freezing :1;
+ enabled_ack :1;
/*
* sysfs attrs
*/
@@ -724,7 +728,8 @@ struct x86_pmu {
pebs_broken :1,
pebs_prec_dist :1,
pebs_no_tlb :1,
- pebs_no_isolation :1;
+ pebs_no_isolation :1,
+ pebs_block :1;
int pebs_record_size;
int pebs_buffer_size;
int max_pebs_events;
@@ -776,6 +781,7 @@ struct x86_pmu {
/*
* Intel perf metrics
*/
+ int num_topdown_events;
u64 (*update_topdown_event)(struct perf_event *event);
int (*set_topdown_event_period)(struct perf_event *event);
@@ -871,6 +877,8 @@ do { \
#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
#define PMU_FL_TFA 0x20 /* deal with TSX force abort */
#define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */
+#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */
+#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -1060,6 +1068,11 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+static inline bool fixed_counter_disabled(int i)
+{
+ return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
+}
+
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
@@ -1157,6 +1170,8 @@ extern struct event_constraint intel_skl_pebs_event_constraints[];
extern struct event_constraint intel_icl_pebs_event_constraints[];
+extern struct event_constraint intel_spr_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_add(struct perf_event *event);
diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c
index 136a1e847254e..600bf8d15c0c9 100644
--- a/arch/x86/events/probe.c
+++ b/arch/x86/events/probe.c
@@ -28,6 +28,7 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data)
for (bit = 0; bit < cnt; bit++) {
if (!msr[bit].no_check) {
struct attribute_group *grp = msr[bit].grp;
+ u64 mask;
/* skip entry with no group */
if (!grp)
@@ -44,8 +45,12 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data)
/* Virt sucks; you cannot tell if a R/O MSR is present :/ */
if (rdmsrl_safe(msr[bit].msr, &val))
continue;
+
+ mask = msr[bit].mask;
+ if (!mask)
+ mask = ~0ULL;
/* Disable zero counters if requested. */
- if (!zero && !val)
+ if (!zero && !(val & mask))
continue;
grp->is_visible = NULL;
diff --git a/arch/x86/events/probe.h b/arch/x86/events/probe.h
index 4c8e0afc5fb53..261b9bda24e36 100644
--- a/arch/x86/events/probe.h
+++ b/arch/x86/events/probe.h
@@ -4,10 +4,11 @@
#include <linux/sysfs.h>
struct perf_msr {
- u64 msr;
- struct attribute_group *grp;
+ u64 msr;
+ struct attribute_group *grp;
bool (*test)(int idx, void *data);
- bool no_check;
+ bool no_check;
+ u64 mask;
};
unsigned long
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 7dbbeaacd9956..f42a70496a246 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -454,16 +454,9 @@ static struct attribute *rapl_events_cores[] = {
NULL,
};
-static umode_t
-rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i)
-{
- return 0;
-}
-
static struct attribute_group rapl_events_cores_group = {
.name = "events",
.attrs = rapl_events_cores,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_pkg[] = {
@@ -476,7 +469,6 @@ static struct attribute *rapl_events_pkg[] = {
static struct attribute_group rapl_events_pkg_group = {
.name = "events",
.attrs = rapl_events_pkg,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_ram[] = {
@@ -489,7 +481,6 @@ static struct attribute *rapl_events_ram[] = {
static struct attribute_group rapl_events_ram_group = {
.name = "events",
.attrs = rapl_events_ram,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_gpu[] = {
@@ -502,7 +493,6 @@ static struct attribute *rapl_events_gpu[] = {
static struct attribute_group rapl_events_gpu_group = {
.name = "events",
.attrs = rapl_events_gpu,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_psys[] = {
@@ -515,7 +505,6 @@ static struct attribute *rapl_events_psys[] = {
static struct attribute_group rapl_events_psys_group = {
.name = "events",
.attrs = rapl_events_psys,
- .is_visible = rapl_not_visible,
};
static bool test_msr(int idx, void *data)
@@ -523,12 +512,23 @@ static bool test_msr(int idx, void *data)
return test_bit(idx, (unsigned long *) data);
}
+/* Only lower 32bits of the MSR represents the energy counter */
+#define RAPL_MSR_MASK 0xFFFFFFFF
+
static struct perf_msr intel_rapl_msrs[] = {
- [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr },
- [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr },
- [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr },
- [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr },
- [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr },
+ [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK },
+};
+
+static struct perf_msr intel_rapl_spr_msrs[] = {
+ [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK },
};
/*
@@ -761,7 +761,7 @@ static struct rapl_model model_spr = {
BIT(PERF_RAPL_PSYS),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_msrs,
+ .rapl_msrs = intel_rapl_spr_msrs,
};
static struct rapl_model model_amd_fam17h = {
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 59bf91c57aa85..1728d4ce5730b 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -30,6 +30,7 @@ enum cpuid_leafs
CPUID_7_ECX,
CPUID_8000_0007_EBX,
CPUID_7_EDX,
+ CPUID_8000_001F_EAX,
};
#ifdef CONFIG_X86_FEATURE_NAMES
@@ -88,8 +89,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
REQUIRED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 20))
#define DISABLED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
@@ -111,8 +113,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
DISABLED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 20))
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 84b887825f126..1feb6c089ba29 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 19 /* N 32-bit words worth of info */
+#define NCAPINTS 20 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -96,7 +96,7 @@
#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
-#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */
+/* FREE! ( 3*32+17) */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
@@ -201,7 +201,7 @@
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
+/* FREE! ( 7*32+10) */
#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
@@ -211,7 +211,7 @@
#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
-#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */
+/* FREE! ( 7*32+20) */
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
@@ -236,8 +236,6 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
-#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
-#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@@ -385,6 +383,13 @@
#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */
#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
+/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */
+#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */
+#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */
+#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */
+#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
+
/*
* BUG word(s)
*/
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 7947cb1782daf..b7dd944dc8673 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -91,6 +91,7 @@
DISABLE_ENQCMD)
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define DISABLED_MASK19 0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index c98f78330b092..4d0b126835b8a 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -12,6 +12,7 @@
#include <linux/pgtable.h>
extern unsigned long efi_fw_vendor, efi_config_table;
+extern unsigned long efi_mixed_mode_stack_pa;
/*
* We map the EFI regions needed for runtime services non-contiguously,
@@ -68,17 +69,33 @@ extern unsigned long efi_fw_vendor, efi_config_table;
#f " called with too many arguments (" #p ">" #n ")"); \
})
+static inline void efi_fpu_begin(void)
+{
+ /*
+ * The UEFI calling convention (UEFI spec 2.3.2 and 2.3.4) requires
+ * that FCW and MXCSR (64-bit) must be initialized prior to calling
+ * UEFI code. (Oddly the spec does not require that the FPU stack
+ * be empty.)
+ */
+ kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+}
+
+static inline void efi_fpu_end(void)
+{
+ kernel_fpu_end();
+}
+
#ifdef CONFIG_X86_32
#define arch_efi_call_virt_setup() \
({ \
- kernel_fpu_begin(); \
+ efi_fpu_begin(); \
firmware_restrict_branch_speculation_start(); \
})
#define arch_efi_call_virt_teardown() \
({ \
firmware_restrict_branch_speculation_end(); \
- kernel_fpu_end(); \
+ efi_fpu_end(); \
})
#define arch_efi_call_virt(p, f, args...) p->f(args)
@@ -94,22 +111,12 @@ extern asmlinkage u64 __efi_call(void *fp, ...);
__efi_call(__VA_ARGS__); \
})
-/*
- * struct efi_scratch - Scratch space used while switching to/from efi_mm
- * @phys_stack: stack used during EFI Mixed Mode
- * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm
- */
-struct efi_scratch {
- u64 phys_stack;
- struct mm_struct *prev_mm;
-} __packed;
-
#define arch_efi_call_virt_setup() \
({ \
efi_sync_low_kernel_mappings(); \
- kernel_fpu_begin(); \
+ efi_fpu_begin(); \
firmware_restrict_branch_speculation_start(); \
- efi_switch_mm(&efi_mm); \
+ efi_enter_mm(); \
})
#define arch_efi_call_virt(p, f, args...) \
@@ -117,9 +124,9 @@ struct efi_scratch {
#define arch_efi_call_virt_teardown() \
({ \
- efi_switch_mm(efi_scratch.prev_mm); \
+ efi_leave_mm(); \
firmware_restrict_branch_speculation_end(); \
- kernel_fpu_end(); \
+ efi_fpu_end(); \
})
#ifdef CONFIG_KASAN
@@ -136,7 +143,6 @@ struct efi_scratch {
#endif /* CONFIG_X86_32 */
-extern struct efi_scratch efi_scratch;
extern int __init efi_memblock_x86_reserve_range(void);
extern void __init efi_print_memmap(void);
extern void __init efi_map_region(efi_memory_desc_t *md);
@@ -149,10 +155,12 @@ extern void __init efi_dump_pagetable(void);
extern void __init efi_apply_memmap_quirks(void);
extern int __init efi_reuse_config(u64 tables, int nr_tables);
extern void efi_delete_dummy_variable(void);
-extern void efi_switch_mm(struct mm_struct *mm);
-extern void efi_recover_from_page_fault(unsigned long phys_addr);
+extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr);
extern void efi_free_boot_services(void);
+void efi_enter_mm(void);
+void efi_leave_mm(void);
+
/* kexec external ABI */
struct efi_setup_data {
u64 fw_vendor;
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 67a4f1cb2aac5..ed33a14188f66 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -32,7 +32,19 @@ extern void fpregs_mark_activate(void);
/* Code that is unaware of kernel_fpu_begin_mask() can use this */
static inline void kernel_fpu_begin(void)
{
+#ifdef CONFIG_X86_64
+ /*
+ * Any 64-bit code that uses 387 instructions must explicitly request
+ * KFPU_387.
+ */
+ kernel_fpu_begin_mask(KFPU_MXCSR);
+#else
+ /*
+ * 32-bit kernel code may use 387 operations as well as SSE2, etc,
+ * as long as it checks that the CPU has the required capability.
+ */
kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+#endif
}
/*
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index f656aabd1545c..5eb3bdf36a419 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -187,23 +187,22 @@ __visible noinstr void func(struct pt_regs *regs, unsigned long error_code)
* has to be done in the function body if necessary.
*/
#define DEFINE_IDTENTRY_IRQ(func) \
-static __always_inline void __##func(struct pt_regs *regs, u8 vector); \
+static void __##func(struct pt_regs *regs, u32 vector); \
\
__visible noinstr void func(struct pt_regs *regs, \
unsigned long error_code) \
{ \
irqentry_state_t state = irqentry_enter(regs); \
+ u32 vector = (u32)(u8)error_code; \
\
instrumentation_begin(); \
- irq_enter_rcu(); \
kvm_set_cpu_l1tf_flush_l1d(); \
- __##func (regs, (u8)error_code); \
- irq_exit_rcu(); \
+ run_irq_on_irqstack_cond(__##func, regs, vector); \
instrumentation_end(); \
irqentry_exit(regs, state); \
} \
\
-static __always_inline void __##func(struct pt_regs *regs, u8 vector)
+static noinline void __##func(struct pt_regs *regs, u32 vector)
/**
* DECLARE_IDTENTRY_SYSVEC - Declare functions for system vector entry points
@@ -237,10 +236,8 @@ __visible noinstr void func(struct pt_regs *regs) \
irqentry_state_t state = irqentry_enter(regs); \
\
instrumentation_begin(); \
- irq_enter_rcu(); \
kvm_set_cpu_l1tf_flush_l1d(); \
run_sysvec_on_irqstack_cond(__##func, regs); \
- irq_exit_rcu(); \
instrumentation_end(); \
irqentry_exit(regs, state); \
} \
@@ -585,6 +582,9 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
#else
DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check);
#endif
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
+#endif
#endif
/* NMI */
@@ -605,6 +605,9 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug);
/* #DF */
DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
+#endif
/* #VC */
#ifdef CONFIG_AMD_MEM_ENCRYPT
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index a8c3d284fa46c..95a448fbb44c9 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -7,9 +7,12 @@
* Copyright (C) IBM Corporation, 2009
*/
+#include <asm/byteorder.h>
/* insn_attr_t is defined in inat.h */
#include <asm/inat.h>
+#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
+
struct insn_field {
union {
insn_value_t value;
@@ -20,6 +23,48 @@ struct insn_field {
unsigned char nbytes;
};
+static inline void insn_field_set(struct insn_field *p, insn_value_t v,
+ unsigned char n)
+{
+ p->value = v;
+ p->nbytes = n;
+}
+
+static inline void insn_set_byte(struct insn_field *p, unsigned char n,
+ insn_byte_t v)
+{
+ p->bytes[n] = v;
+}
+
+#else
+
+struct insn_field {
+ insn_value_t value;
+ union {
+ insn_value_t little;
+ insn_byte_t bytes[4];
+ };
+ /* !0 if we've run insn_get_xxx() for this field */
+ unsigned char got;
+ unsigned char nbytes;
+};
+
+static inline void insn_field_set(struct insn_field *p, insn_value_t v,
+ unsigned char n)
+{
+ p->value = v;
+ p->little = __cpu_to_le32(v);
+ p->nbytes = n;
+}
+
+static inline void insn_set_byte(struct insn_field *p, unsigned char n,
+ insn_byte_t v)
+{
+ p->bytes[n] = v;
+ p->value = __le32_to_cpu(p->little);
+}
+#endif
+
struct insn {
struct insn_field prefixes; /*
* Prefixes
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 528c8a71fe7f7..768aa234cbb4a 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -25,8 +25,6 @@ static inline int irq_canonicalize(int irq)
extern int irq_init_percpu_irqstack(unsigned int cpu);
-#define __ARCH_HAS_DO_SOFTIRQ
-
struct irq_desc;
extern void fixup_irqs(void);
@@ -40,8 +38,6 @@ extern void native_init_IRQ(void);
extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs);
-extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector);
-
extern void init_ISA_irqs(void);
extern void __init init_IRQ(void);
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 775816965c6ae..9b2a0ff76c73e 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -7,100 +7,217 @@
#include <asm/processor.h>
#ifdef CONFIG_X86_64
-static __always_inline bool irqstack_active(void)
-{
- return __this_cpu_read(irq_count) != -1;
-}
-
-void asm_call_on_stack(void *sp, void (*func)(void), void *arg);
-void asm_call_sysvec_on_stack(void *sp, void (*func)(struct pt_regs *regs),
- struct pt_regs *regs);
-void asm_call_irq_on_stack(void *sp, void (*func)(struct irq_desc *desc),
- struct irq_desc *desc);
-static __always_inline void __run_on_irqstack(void (*func)(void))
-{
- void *tos = __this_cpu_read(hardirq_stack_ptr);
-
- __this_cpu_add(irq_count, 1);
- asm_call_on_stack(tos - 8, func, NULL);
- __this_cpu_sub(irq_count, 1);
+/*
+ * Macro to inline switching to an interrupt stack and invoking function
+ * calls from there. The following rules apply:
+ *
+ * - Ordering:
+ *
+ * 1. Write the stack pointer into the top most place of the irq
+ * stack. This ensures that the various unwinders can link back to the
+ * original stack.
+ *
+ * 2. Switch the stack pointer to the top of the irq stack.
+ *
+ * 3. Invoke whatever needs to be done (@asm_call argument)
+ *
+ * 4. Pop the original stack pointer from the top of the irq stack
+ * which brings it back to the original stack where it left off.
+ *
+ * - Function invocation:
+ *
+ * To allow flexible usage of the macro, the actual function code including
+ * the store of the arguments in the call ABI registers is handed in via
+ * the @asm_call argument.
+ *
+ * - Local variables:
+ *
+ * @tos:
+ * The @tos variable holds a pointer to the top of the irq stack and
+ * _must_ be allocated in a non-callee saved register as this is a
+ * restriction coming from objtool.
+ *
+ * Note that (tos) is in both the input and output constraints to ensure
+ * that the compiler does not assume that R11 is left untouched in
+ * case this macro is used in some place where the per cpu interrupt
+ * stack pointer is used again afterwards.
+ *
+ * - Function arguments:
+ * The function argument(s), if any, have to be defined in register
+ * variables at the place where this is invoked. Storing the
+ * argument(s) in the proper register(s) is part of the @asm_call
+ *
+ * - Constraints:
+ *
+ * The constraints have to be done very carefully because the compiler
+ * does not know about the assembly call.
+ *
+ * output:
+ * As documented already above the @tos variable is required to be in
+ * the output constraints to make the compiler aware that R11 cannot be
+ * reused after the asm() statement.
+ *
+ * For builds with CONFIG_UNWINDER_FRAME_POINTER, ASM_CALL_CONSTRAINT is
+ * required as well as this prevents certain creative GCC variants from
+ * misplacing the ASM code.
+ *
+ * input:
+ * - func:
+ * Immediate, which tells the compiler that the function is referenced.
+ *
+ * - tos:
+ * Register. The actual register is defined by the variable declaration.
+ *
+ * - function arguments:
+ * The constraints are handed in via the 'argconstr' argument list. They
+ * describe the register arguments which are used in @asm_call.
+ *
+ * clobbers:
+ * Function calls can clobber anything except the callee-saved
+ * registers. Tell the compiler.
+ */
+#define call_on_irqstack(func, asm_call, argconstr...) \
+{ \
+ register void *tos asm("r11"); \
+ \
+ tos = ((void *)__this_cpu_read(hardirq_stack_ptr)); \
+ \
+ asm_inline volatile( \
+ "movq %%rsp, (%[tos]) \n" \
+ "movq %[tos], %%rsp \n" \
+ \
+ asm_call \
+ \
+ "popq %%rsp \n" \
+ \
+ : "+r" (tos), ASM_CALL_CONSTRAINT \
+ : [__func] "i" (func), [tos] "r" (tos) argconstr \
+ : "cc", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", \
+ "memory" \
+ ); \
}
-static __always_inline void
-__run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs),
- struct pt_regs *regs)
-{
- void *tos = __this_cpu_read(hardirq_stack_ptr);
-
- __this_cpu_add(irq_count, 1);
- asm_call_sysvec_on_stack(tos - 8, func, regs);
- __this_cpu_sub(irq_count, 1);
+/* Macros to assert type correctness for run_*_on_irqstack macros */
+#define assert_function_type(func, proto) \
+ static_assert(__builtin_types_compatible_p(typeof(&func), proto))
+
+#define assert_arg_type(arg, proto) \
+ static_assert(__builtin_types_compatible_p(typeof(arg), proto))
+
+/*
+ * Macro to invoke system vector and device interrupt C handlers.
+ */
+#define call_on_irqstack_cond(func, regs, asm_call, constr, c_args...) \
+{ \
+ /* \
+ * User mode entry and interrupt on the irq stack do not \
+ * switch stacks. If from user mode the task stack is empty. \
+ */ \
+ if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \
+ irq_enter_rcu(); \
+ func(c_args); \
+ irq_exit_rcu(); \
+ } else { \
+ /* \
+ * Mark the irq stack inuse _before_ and unmark _after_ \
+ * switching stacks. Interrupts are disabled in both \
+ * places. Invoke the stack switch macro with the call \
+ * sequence which matches the above direct invocation. \
+ */ \
+ __this_cpu_write(hardirq_stack_inuse, true); \
+ call_on_irqstack(func, asm_call, constr); \
+ __this_cpu_write(hardirq_stack_inuse, false); \
+ } \
}
-static __always_inline void
-__run_irq_on_irqstack(void (*func)(struct irq_desc *desc),
- struct irq_desc *desc)
-{
- void *tos = __this_cpu_read(hardirq_stack_ptr);
-
- __this_cpu_add(irq_count, 1);
- asm_call_irq_on_stack(tos - 8, func, desc);
- __this_cpu_sub(irq_count, 1);
+/*
+ * Function call sequence for call_on_irqstack() for system vectors.
+ *
+ * Note that irq_enter_rcu() and irq_exit_rcu() do not use the input
+ * mechanism because these functions are global and cannot be optimized out
+ * when compiling a particular source file which uses one of these macros.
+ *
+ * The argument (regs) does not need to be pushed or stashed in a callee
+ * saved register to be safe vs. the irq_enter_rcu() call because the
+ * clobbers already prevent the compiler from storing it in a callee
+ * clobbered register. As the compiler has to preserve @regs for the final
+ * call to idtentry_exit() anyway, it's likely that it does not cause extra
+ * effort for this asm magic.
+ */
+#define ASM_CALL_SYSVEC \
+ "call irq_enter_rcu \n" \
+ "movq %[arg1], %%rdi \n" \
+ "call %P[__func] \n" \
+ "call irq_exit_rcu \n"
+
+#define SYSVEC_CONSTRAINTS , [arg1] "r" (regs)
+
+#define run_sysvec_on_irqstack_cond(func, regs) \
+{ \
+ assert_function_type(func, void (*)(struct pt_regs *)); \
+ assert_arg_type(regs, struct pt_regs *); \
+ \
+ call_on_irqstack_cond(func, regs, ASM_CALL_SYSVEC, \
+ SYSVEC_CONSTRAINTS, regs); \
}
-#else /* CONFIG_X86_64 */
-static inline bool irqstack_active(void) { return false; }
-static inline void __run_on_irqstack(void (*func)(void)) { }
-static inline void __run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs),
- struct pt_regs *regs) { }
-static inline void __run_irq_on_irqstack(void (*func)(struct irq_desc *desc),
- struct irq_desc *desc) { }
-#endif /* !CONFIG_X86_64 */
-
-static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- return false;
- if (!regs)
- return !irqstack_active();
- return !user_mode(regs) && !irqstack_active();
+/*
+ * As in ASM_CALL_SYSVEC above the clobbers force the compiler to store
+ * @regs and @vector in callee saved registers.
+ */
+#define ASM_CALL_IRQ \
+ "call irq_enter_rcu \n" \
+ "movq %[arg1], %%rdi \n" \
+ "movl %[arg2], %%esi \n" \
+ "call %P[__func] \n" \
+ "call irq_exit_rcu \n"
+
+#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" (vector)
+
+#define run_irq_on_irqstack_cond(func, regs, vector) \
+{ \
+ assert_function_type(func, void (*)(struct pt_regs *, u32)); \
+ assert_arg_type(regs, struct pt_regs *); \
+ assert_arg_type(vector, u32); \
+ \
+ call_on_irqstack_cond(func, regs, ASM_CALL_IRQ, \
+ IRQ_CONSTRAINTS, regs, vector); \
}
-
-static __always_inline void run_on_irqstack_cond(void (*func)(void),
- struct pt_regs *regs)
-{
- lockdep_assert_irqs_disabled();
-
- if (irq_needs_irq_stack(regs))
- __run_on_irqstack(func);
- else
- func();
+#define ASM_CALL_SOFTIRQ \
+ "call %P[__func] \n"
+
+/*
+ * Macro to invoke __do_softirq on the irq stack. This is only called from
+ * task context when bottom halves are about to be reenabled and soft
+ * interrupts are pending to be processed. The interrupt stack cannot be in
+ * use here.
+ */
+#define do_softirq_own_stack() \
+{ \
+ __this_cpu_write(hardirq_stack_inuse, true); \
+ call_on_irqstack(__do_softirq, ASM_CALL_SOFTIRQ); \
+ __this_cpu_write(hardirq_stack_inuse, false); \
}
-static __always_inline void
-run_sysvec_on_irqstack_cond(void (*func)(struct pt_regs *regs),
- struct pt_regs *regs)
-{
- lockdep_assert_irqs_disabled();
-
- if (irq_needs_irq_stack(regs))
- __run_sysvec_on_irqstack(func, regs);
- else
- func(regs);
+#else /* CONFIG_X86_64 */
+/* System vector handlers always run on the stack they interrupted. */
+#define run_sysvec_on_irqstack_cond(func, regs) \
+{ \
+ irq_enter_rcu(); \
+ func(regs); \
+ irq_exit_rcu(); \
}
-static __always_inline void
-run_irq_on_irqstack_cond(void (*func)(struct irq_desc *desc), struct irq_desc *desc,
- struct pt_regs *regs)
-{
- lockdep_assert_irqs_disabled();
-
- if (irq_needs_irq_stack(regs))
- __run_irq_on_irqstack(func, desc);
- else
- func(desc);
+/* Switches to the irq stack within func() */
+#define run_irq_on_irqstack_cond(func, regs, vector) \
+{ \
+ irq_enter_rcu(); \
+ func(regs, vector); \
+ irq_exit_rcu(); \
}
+#endif /* !CONFIG_X86_64 */
+
#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2dfc8d380dab1..144d70ea43936 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -35,15 +35,6 @@ extern __always_inline unsigned long native_save_fl(void)
return flags;
}
-extern inline void native_restore_fl(unsigned long flags);
-extern inline void native_restore_fl(unsigned long flags)
-{
- asm volatile("push %0 ; popf"
- : /* no output */
- :"g" (flags)
- :"memory", "cc");
-}
-
static __always_inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
@@ -79,11 +70,6 @@ static __always_inline unsigned long arch_local_save_flags(void)
return native_save_fl();
}
-static __always_inline void arch_local_irq_restore(unsigned long flags)
-{
- native_restore_fl(flags);
-}
-
static __always_inline void arch_local_irq_disable(void)
{
native_irq_disable();
@@ -131,25 +117,7 @@ static __always_inline unsigned long arch_local_irq_save(void)
#define SAVE_FLAGS(x) pushfq; popq %rax
#endif
-#define SWAPGS swapgs
-/*
- * Currently paravirt can't handle swapgs nicely when we
- * don't have a stack we can rely on (such as a user space
- * stack). So we either find a way around these or just fault
- * and emulate if a guest tries to call swapgs directly.
- *
- * Either way, this is a good way to document that we don't
- * have a reliable stack. x86_64 only.
- */
-#define SWAPGS_UNSAFE_STACK swapgs
-
#define INTERRUPT_RETURN jmp native_iret
-#define USERGS_SYSRET64 \
- swapgs; \
- sysretq;
-#define USERGS_SYSRET32 \
- swapgs; \
- sysretl
#else
#define INTERRUPT_RETURN iret
@@ -170,6 +138,20 @@ static __always_inline int arch_irqs_disabled(void)
return arch_irqs_disabled_flags(flags);
}
+
+/*
+ * Restore the interrupt state described by @flags: when @flags does not
+ * indicate "interrupts disabled", enable interrupts; otherwise do nothing.
+ * This function never disables interrupts itself.
+ */
+static __always_inline void arch_local_irq_restore(unsigned long flags)
+{
+ if (!arch_irqs_disabled_flags(flags))
+ arch_local_irq_enable();
+}
+#else
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_XEN_PV
+#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
+#else
+#define SWAPGS swapgs
+#endif
+#endif
#endif /* !__ASSEMBLY__ */
#endif
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 991a7ad540c72..d20a3d6be36ec 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -58,14 +58,17 @@ struct arch_specific_insn {
/* copy of the original instruction */
kprobe_opcode_t *insn;
/*
- * boostable = false: This instruction type is not boostable.
- * boostable = true: This instruction has been boosted: we have
+ * boostable = 0: This instruction type is not boostable.
+ * boostable = 1: This instruction has been boosted: we have
* added a relative jump after the instruction copy in insn,
* so no single-step and fixup are needed (unless there's
* a post_handler).
*/
- bool boostable;
- bool if_modifier;
+ unsigned boostable:1;
+ unsigned if_modifier:1;
+ unsigned is_call:1;
+ unsigned is_pushf:1;
+ unsigned is_abs_ip:1;
/* Number of bytes of text poked */
int tp_len;
};
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 56cdeaac76a0e..ddfb3cad8dff2 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -289,28 +289,6 @@ extern void (*mce_threshold_vector)(void);
extern void (*deferred_error_int_vector)(void);
/*
- * Thermal handler
- */
-
-void intel_init_thermal(struct cpuinfo_x86 *c);
-
-/* Interrupt Handler for core thermal thresholds */
-extern int (*platform_thermal_notify)(__u64 msr_val);
-
-/* Interrupt Handler for package thermal thresholds */
-extern int (*platform_thermal_package_notify)(__u64 msr_val);
-
-/* Callback support of rate control, return true, if
- * callback has rate control */
-extern bool (*platform_thermal_package_rate_control)(void);
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-extern void mcheck_intel_therm_init(void);
-#else
-static inline void mcheck_intel_therm_init(void) { }
-#endif
-
-/*
* Used by APEI to report memory error via /dev/mcelog
*/
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 2b7cc5397f80d..ab45a220fac47 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -127,14 +127,12 @@ static inline unsigned int x86_cpuid_family(void)
}
#ifdef CONFIG_MICROCODE
-int __init microcode_init(void);
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
void reload_early_microcode(void);
extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
extern bool initrd_gone;
#else
-static inline int __init microcode_init(void) { return 0; };
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void reload_early_microcode(void) { }
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index fdbffec4cfdea..5a2baf28a1dcd 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -40,6 +40,8 @@
#define ORC_REG_MAX 15
#ifndef __ASSEMBLY__
+#include <asm/byteorder.h>
+
/*
* This struct is more or less a vastly simplified version of the DWARF Call
* Frame Information standard. It contains only the necessary parts of DWARF
@@ -51,10 +53,18 @@
struct orc_entry {
s16 sp_offset;
s16 bp_offset;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned sp_reg:4;
unsigned bp_reg:4;
unsigned type:2;
unsigned end:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ unsigned bp_reg:4;
+ unsigned sp_reg:4;
+ unsigned unused:5;
+ unsigned end:1;
+ unsigned type:2;
+#endif
} __packed;
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 645bd1d0ee072..64297eabad634 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -66,7 +66,7 @@
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, then that syscall will enter the kernel with a
* non-canonical return address, and SYSRET will explode dangerously.
- * We avoid this particular problem by preventing anything executable
+ * We avoid this particular problem by preventing anything
* from being mapped at the maximum canonical address.
*
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index f8dce11d2bc1a..4abf110e22438 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -648,11 +648,6 @@ static inline notrace unsigned long arch_local_save_flags(void)
return PVOP_CALLEE0(unsigned long, irq.save_fl);
}
-static inline notrace void arch_local_irq_restore(unsigned long f)
-{
- PVOP_VCALLEE1(irq.restore_fl, f);
-}
-
static inline notrace void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(irq.irq_disable);
@@ -776,31 +771,6 @@ extern void default_banner(void);
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
-/*
- * If swapgs is used while the userspace stack is still current,
- * there's no way to call a pvop. The PV replacement *must* be
- * inlined, or the swapgs instruction must be trapped and emulated.
- */
-#define SWAPGS_UNSAFE_STACK \
- PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs)
-
-/*
- * Note: swapgs is very special, and in practise is either going to be
- * implemented with a single "swapgs" instruction or something very
- * special. Either way, we don't need to save any registers for
- * it.
- */
-#define SWAPGS \
- PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \
- )
-
-#define USERGS_SYSRET64 \
- PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \
- ANNOTATE_RETPOLINE_SAFE; \
- jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);)
-
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(clobbers) \
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b6b02b7c19cc9..de87087d3bde1 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -156,20 +156,10 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
- /*
- * Switch to usermode gs and return to 64-bit usermode using
- * sysret. Only used in 64-bit kernels to return to 64-bit
- * processes. Usermode register state, including %rsp, must
- * already be restored.
- */
- void (*usergs_sysret64)(void);
-
/* Normal iret. Jump to this with the standard iret stack
frame set up. */
void (*iret)(void);
- void (*swapgs)(void);
-
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
#endif
@@ -178,16 +168,13 @@ struct pv_cpu_ops {
struct pv_irq_ops {
#ifdef CONFIG_PARAVIRT_XXL
/*
- * Get/set interrupt state. save_fl and restore_fl are only
- * expected to use X86_EFLAGS_IF; all other bits
- * returned from save_fl are undefined, and may be ignored by
- * restore_fl.
+ * Get/set interrupt state. save_fl is expected to use X86_EFLAGS_IF;
+ * all other bits returned from save_fl are undefined.
*
* NOTE: These functions callers expect the callee to preserve
* more registers than the standard C calling convention.
*/
struct paravirt_callee_save save_fl;
- struct paravirt_callee_save restore_fl;
struct paravirt_callee_save irq_disable;
struct paravirt_callee_save irq_enable;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index b9a7fd0a27e2d..544f41a179fb6 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -261,8 +261,12 @@ struct x86_pmu_capability {
#define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1)
#define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2)
#define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3)
-#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_BE_BOUND
-#define INTEL_PMC_MSK_TOPDOWN ((0xfull << INTEL_PMC_IDX_METRIC_BASE) | \
+#define INTEL_PMC_IDX_TD_HEAVY_OPS (INTEL_PMC_IDX_METRIC_BASE + 4)
+#define INTEL_PMC_IDX_TD_BR_MISPREDICT (INTEL_PMC_IDX_METRIC_BASE + 5)
+#define INTEL_PMC_IDX_TD_FETCH_LAT (INTEL_PMC_IDX_METRIC_BASE + 6)
+#define INTEL_PMC_IDX_TD_MEM_BOUND (INTEL_PMC_IDX_METRIC_BASE + 7)
+#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_MEM_BOUND
+#define INTEL_PMC_MSK_TOPDOWN ((0xffull << INTEL_PMC_IDX_METRIC_BASE) | \
INTEL_PMC_MSK_FIXED_SLOTS)
/*
@@ -280,8 +284,14 @@ struct x86_pmu_capability {
#define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */
#define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */
#define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */
-#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_BE_BOUND
-#define INTEL_TD_METRIC_NUM 4
+/* Level 2 metrics */
+#define INTEL_TD_METRIC_HEAVY_OPS 0x8400 /* Heavy Operations metric */
+#define INTEL_TD_METRIC_BR_MISPREDICT 0x8500 /* Branch Mispredict metric */
+#define INTEL_TD_METRIC_FETCH_LAT 0x8600 /* Fetch Latency metric */
+#define INTEL_TD_METRIC_MEM_BOUND 0x8700 /* Memory bound metric */
+
+#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND
+#define INTEL_TD_METRIC_NUM 8
static inline bool is_metric_idx(int idx)
{
@@ -483,11 +493,7 @@ static inline void perf_check_microcode(void) { }
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
#else
-static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
- *nr = 0;
- return NULL;
-}
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
{
return -1;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 394757ee030a6..f24d7ef8fffae 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -177,8 +177,6 @@ enum page_cache_mode {
#define __pgprot(x) ((pgprot_t) { (x) } )
#define __pg(x) __pgprot(x)
-#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
-
#define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G)
#define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0)
#define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 69485ca13665f..f8cb8af4de5ce 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -5,6 +5,7 @@
#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/thread_info.h>
+#include <linux/static_call_types.h>
DECLARE_PER_CPU(int, __preempt_count);
@@ -103,16 +104,45 @@ static __always_inline bool should_resched(int preempt_offset)
}
#ifdef CONFIG_PREEMPTION
- extern asmlinkage void preempt_schedule_thunk(void);
-# define __preempt_schedule() \
- asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)
- extern asmlinkage void preempt_schedule(void);
- extern asmlinkage void preempt_schedule_notrace_thunk(void);
-# define __preempt_schedule_notrace() \
- asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT)
+extern asmlinkage void preempt_schedule(void);
+extern asmlinkage void preempt_schedule_thunk(void);
- extern asmlinkage void preempt_schedule_notrace(void);
-#endif
+#define __preempt_schedule_func preempt_schedule_thunk
+
+extern asmlinkage void preempt_schedule_notrace(void);
+extern asmlinkage void preempt_schedule_notrace_thunk(void);
+
+#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+
+#define __preempt_schedule() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+
+#define __preempt_schedule_notrace() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+#else /* PREEMPT_DYNAMIC */
+
+/*
+ * Call the preempt_schedule thunk directly. The macro deliberately has no
+ * trailing semicolon: the caller supplies it, so the expansion is exactly
+ * one statement and remains usable as the body of an if/else.
+ */
+#define __preempt_schedule() \
+ asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)
+
+/*
+ * Call the preempt_schedule_notrace thunk directly. The macro deliberately
+ * has no trailing semicolon: the caller supplies it, so the expansion is
+ * exactly one statement and remains usable as the body of an if/else.
+ */
+#define __preempt_schedule_notrace() \
+ asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT)
+
+#endif /* PREEMPT_DYNAMIC */
+
+#endif /* PREEMPTION */
#endif /* __ASM_PREEMPT_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b4..dc6d149bf851f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -426,8 +426,6 @@ struct irq_stack {
char stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);
-DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
-
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
@@ -454,7 +452,8 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
}
-DECLARE_PER_CPU(unsigned int, irq_count);
+DECLARE_PER_CPU(void *, hardirq_stack_ptr);
+DECLARE_PER_CPU(bool, hardirq_stack_inuse);
extern asmlinkage void ignore_sysret(void);
/* Save actual FS/GS selectors and bases to current->thread */
@@ -473,9 +472,9 @@ struct stack_canary {
};
DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
-/* Per CPU softirq stack pointer */
+DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
-#endif /* X86_64 */
+#endif /* !X86_64 */
extern unsigned int fpu_kernel_xstate_size;
extern unsigned int fpu_user_xstate_size;
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 3ff0d48469f28..b2d504f119370 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -101,6 +101,7 @@
#define REQUIRED_MASK16 0
#define REQUIRED_MASK17 0
#define REQUIRED_MASK18 0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define REQUIRED_MASK19 0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 07603064df8fc..d60ed0668a593 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -56,19 +56,22 @@ static void __resctrl_sched_in(void)
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
u32 closid = state->default_closid;
u32 rmid = state->default_rmid;
+ u32 tmp;
/*
* If this task has a closid/rmid assigned, use it.
* Else use the closid/rmid assigned to this cpu.
*/
if (static_branch_likely(&rdt_alloc_enable_key)) {
- if (current->closid)
- closid = current->closid;
+ tmp = READ_ONCE(current->closid);
+ if (tmp)
+ closid = tmp;
}
if (static_branch_likely(&rdt_mon_enable_key)) {
- if (current->rmid)
- rmid = current->rmid;
+ tmp = READ_ONCE(current->rmid);
+ if (tmp)
+ rmid = tmp;
}
if (closid != state->cur_closid || rmid != state->cur_rmid) {
diff --git a/arch/x86/include/asm/softirq_stack.h b/arch/x86/include/asm/softirq_stack.h
new file mode 100644
index 0000000000000..889d53d6a0e12
--- /dev/null
+++ b/arch/x86/include/asm/softirq_stack.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SOFTIRQ_STACK_H
+#define _ASM_X86_SOFTIRQ_STACK_H
+
+#ifdef CONFIG_X86_64
+# include <asm/irq_stack.h>
+#else
+# include <asm-generic/softirq_stack.h>
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index cc177b4431ae8..1d3cbaef4bb71 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -243,10 +243,10 @@ static inline void serialize(void)
}
/* The dst parameter must be 64-bytes aligned */
-static inline void movdir64b(void *dst, const void *src)
+static inline void movdir64b(void __iomem *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
- struct { char _[64]; } *__dst = dst;
+ struct { char _[64]; } __iomem *__dst = dst;
/*
* MOVDIR64B %(rdx), rax.
@@ -286,7 +286,7 @@ static inline void movdir64b(void *dst, const void *src)
static inline int enqcmds(void __iomem *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
- struct { char _[64]; } *__dst = dst;
+ struct { char _[64]; } __iomem *__dst = dst;
int zf;
/*
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index c37f11999d0c0..cbb67b6030f97 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -37,4 +37,11 @@
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+
+/*
+ * Emit one entry into the .static_call_tramp_key section for static call
+ * @name. The entry consists of two .long values: the offset from the
+ * value's own location to the trampoline symbol, followed by the offset
+ * from the value's own location to the key symbol.
+ */
+#define ARCH_ADD_TRAMP_KEY(name) \
+ asm(".pushsection .static_call_tramp_key, \"a\" \n" \
+ ".long " STATIC_CALL_TRAMP_STR(name) " - . \n" \
+ ".long " STATIC_CALL_KEY_STR(name) " - . \n" \
+ ".popsection \n")
+
#endif /* _ASM_STATIC_CALL_H */
diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h
new file mode 100644
index 0000000000000..ddbdefd5b94f1
--- /dev/null
+++ b/arch/x86/include/asm/thermal.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_THERMAL_H
+#define _ASM_X86_THERMAL_H
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+void intel_init_thermal(struct cpuinfo_x86 *c);
+bool x86_thermal_enabled(void);
+void intel_thermal_interrupt(void);
+#else
+static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
+#endif
+
+#endif /* _ASM_X86_THERMAL_H */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 820082bd68804..1bfe979bb9bcd 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -4,7 +4,6 @@
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 664d4610d700e..8e574c0afef80 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -48,17 +48,8 @@
UNWIND_HINT_REGS base=\base offset=\offset partial=1
.endm
-.macro UNWIND_HINT_FUNC sp_offset=8
- UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL
-.endm
-
-/*
- * RET_OFFSET: Used on instructions that terminate a function; mostly RETURN
- * and sibling calls. On these, sp_offset denotes the expected offset from
- * initial_func_cfi.
- */
-.macro UNWIND_HINT_RET_OFFSET sp_offset=8
- UNWIND_HINT sp_reg=ORC_REG_SP type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset
+.macro UNWIND_HINT_FUNC
+ UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC
.endm
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 26efbec94448d..9e8ac5073ecb8 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -36,7 +36,6 @@ struct vm86 {
unsigned long saved_sp0;
unsigned long flags;
- unsigned long screen_bitmap;
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h
index d2ee4e307ef81..18909b8050bc5 100644
--- a/arch/x86/include/uapi/asm/vm86.h
+++ b/arch/x86/include/uapi/asm/vm86.h
@@ -97,7 +97,7 @@ struct revectored_struct {
struct vm86_struct {
struct vm86_regs regs;
unsigned long flags;
- unsigned long screen_bitmap;
+ unsigned long screen_bitmap; /* unused, preserved by vm86() */
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
@@ -106,7 +106,7 @@ struct vm86_struct {
/*
* flags masks
*/
-#define VM86_SCREEN_BITMAP 0x0001
+#define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */
struct vm86plus_info_struct {
unsigned long force_return_for_pic:1;
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index f1bb57b0e41ea..cf340d85946a8 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 5d3a0b8fd3798..56b6865afb2ac 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0-only */
.text
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/segment.h>
#include <asm/pgtable_types.h>
#include <asm/page_types.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>
+#include <asm/nospec-branch.h>
# Copyright 2003 Pavel Machek <pavel@suse.cz
@@ -39,6 +41,7 @@ SYM_FUNC_START(wakeup_long64)
movq saved_rbp, %rbp
movq saved_rip, %rax
+ ANNOTATE_RETPOLINE_SAFE
jmp *%rax
SYM_FUNC_END(wakeup_long64)
@@ -126,6 +129,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
FRAME_END
jmp restore_processor_state
SYM_FUNC_END(do_suspend_lowlevel)
+STACK_FRAME_NON_STANDARD do_suspend_lowlevel
.data
saved_rbp: .quad 0
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7f4c081f59f0c..138e1435ac00a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2137,18 +2137,11 @@ void __init register_lapic_address(unsigned long address)
* Local APIC interrupts
*/
-/**
- * spurious_interrupt - Catch all for interrupts raised on unused vectors
- * @regs: Pointer to pt_regs on stack
- * @vector: The vector number
- *
- * This is invoked from ASM entry code to catch all interrupts which
- * trigger on an entry which is routed to the common_spurious idtentry
- * point.
- *
- * Also called from sysvec_spurious_apic_interrupt().
+/*
+ * Common handling code for spurious_interrupt and spurious_vector entry
+ * points below. No point in allowing the compiler to inline it twice.
*/
-DEFINE_IDTENTRY_IRQ(spurious_interrupt)
+static noinline void handle_spurious_interrupt(u8 vector)
{
u32 v;
@@ -2183,9 +2176,23 @@ out:
trace_spurious_apic_exit(vector);
}
+/**
+ * spurious_interrupt - Catch all for interrupts raised on unused vectors
+ * @regs: Pointer to pt_regs on stack
+ * @vector: The vector number
+ *
+ * This is invoked from ASM entry code to catch all interrupts which
+ * trigger on an entry which is routed to the common_spurious idtentry
+ * point.
+ */
+DEFINE_IDTENTRY_IRQ(spurious_interrupt)
+{
+ handle_spurious_interrupt(vector);
+}
+
DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt)
{
- __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR);
+ handle_spurious_interrupt(SPURIOUS_APIC_VECTOR);
}
/*
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 828be792231e9..b14533af76762 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -13,9 +13,6 @@ int main(void)
{
#ifdef CONFIG_PARAVIRT
#ifdef CONFIG_PARAVIRT_XXL
- OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template,
- cpu.usergs_sysret64);
- OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464e..ab640abe26b68 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -960,6 +960,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x8000000a)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+ if (c->extended_cpuid_level >= 0x8000001f)
+ c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
+
init_scattered_cpuid_features(c);
init_speculation_control(c);
@@ -1739,8 +1742,8 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
-DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
+DEFINE_PER_CPU(void *, hardirq_stack_ptr);
+DEFINE_PER_CPU(bool, hardirq_stack_inuse);
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 816fdbec795a4..0e422a5448351 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -24,6 +24,7 @@
#include <asm/traps.h>
#include <asm/resctrl.h>
#include <asm/numa.h>
+#include <asm/thermal.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c)
tsx_disable();
split_lock_init();
+
+ intel_init_thermal(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile
index 9f020c9941545..015856abdbb19 100644
--- a/arch/x86/kernel/cpu/mce/Makefile
+++ b/arch/x86/kernel/cpu/mce/Makefile
@@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
mce-inject-y := inject.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
-obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
-
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e133ce1e562b3..7962355436dac 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -878,6 +878,12 @@ static atomic_t mce_executing;
static atomic_t mce_callin;
/*
+ * Track which CPUs entered the MCA broadcast synchronization and which not in
+ * order to print holdouts.
+ */
+static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
+
+/*
* Check if a timeout waiting for other CPUs happened.
*/
static int mce_timed_out(u64 *t, const char *msg)
@@ -894,8 +900,12 @@ static int mce_timed_out(u64 *t, const char *msg)
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
- if (mca_cfg.tolerant <= 1)
+ if (mca_cfg.tolerant <= 1) {
+ if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
+ pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
+ cpumask_pr_args(&mce_missing_cpus));
mce_panic(msg, NULL, NULL);
+ }
cpu_missing = 1;
return 1;
}
@@ -1006,6 +1016,7 @@ static int mce_start(int *no_way_out)
* is updated before mce_callin.
*/
order = atomic_inc_return(&mce_callin);
+ cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
/*
* Wait for everyone.
@@ -1114,6 +1125,7 @@ static int mce_end(int order)
reset:
atomic_set(&global_nwo, 0);
atomic_set(&mce_callin, 0);
+ cpumask_setall(&mce_missing_cpus);
barrier();
/*
@@ -2178,7 +2190,6 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
- mcheck_intel_therm_init();
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
@@ -2713,6 +2724,7 @@ static void mce_reset(void)
atomic_set(&mce_executing, 0);
atomic_set(&mce_callin, 0);
atomic_set(&global_nwo, 0);
+ cpumask_setall(&mce_missing_cpus);
}
static int fake_panic_get(void *data, u64 *val)
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index c2476fe0682e6..e309476743b74 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -531,7 +531,6 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
- intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
intel_ppin_init(c);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index ec6f0415bc6d1..b935e1b5f115e 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -830,7 +830,7 @@ static const struct attribute_group cpu_root_microcode_group = {
.attrs = cpu_root_microcode_attrs,
};
-int __init microcode_init(void)
+static int __init microcode_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 5bd011737272d..9231640782fa2 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void)
if (!size_base)
continue;
- size_base = to_size_factor(size_base, &size_factor),
+ size_base = to_size_factor(size_base, &size_factor);
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
+ start_base = to_size_factor(start_base, &start_factor);
type = range_state[i].type;
pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index a29997e6cf9e6..b90f3f437765c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -3,7 +3,6 @@
* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
* because MTRRs can span up to 40 bits (36bits on most modern x86)
*/
-#define DEBUG
#include <linux/export.h>
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 61eb26edc6d20..28c8a23aa42ee 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -31,8 +31,6 @@
System Programming Guide; Section 9.11. (1997 edition - PPro).
*/
-#define DEBUG
-
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
#include <linux/stop_machine.h>
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index ee71c47844cb1..c4d320d02fd5b 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -572,6 +572,7 @@ union cpuid_0x10_x_edx {
void rdt_last_cmd_clear(void);
void rdt_last_cmd_puts(const char *s);
+__printf(1, 2)
void rdt_last_cmd_printf(const char *fmt, ...);
void rdt_ctrl_update(void *arg);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 460f3e0df106c..f9190adc52cb9 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -563,11 +563,11 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
*/
if (rdtgrp->type == RDTCTRL_GROUP) {
- tsk->closid = rdtgrp->closid;
- tsk->rmid = rdtgrp->mon.rmid;
+ WRITE_ONCE(tsk->closid, rdtgrp->closid);
+ WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
} else if (rdtgrp->type == RDTMON_GROUP) {
if (rdtgrp->mon.parent->closid == tsk->closid) {
- tsk->rmid = rdtgrp->mon.rmid;
+ WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
} else {
rdt_last_cmd_puts("Can't move task to different control group\n");
return -EINVAL;
@@ -2310,22 +2310,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
for_each_process_thread(p, t) {
if (!from || is_closid_match(t, from) ||
is_rmid_match(t, from)) {
- t->closid = to->closid;
- t->rmid = to->mon.rmid;
+ WRITE_ONCE(t->closid, to->closid);
+ WRITE_ONCE(t->rmid, to->mon.rmid);
-#ifdef CONFIG_SMP
/*
- * This is safe on x86 w/o barriers as the ordering
- * of writing to task_cpu() and t->on_cpu is
- * reverse to the reading here. The detection is
- * inaccurate as tasks might move or schedule
- * before the smp function call takes place. In
- * such a case the function call is pointless, but
+ * If the task is on a CPU, set the CPU in the mask.
+ * The detection is inaccurate as tasks might move or
+ * schedule before the smp function call takes place.
+ * In such a case the function call is pointless, but
* there is no other side effect.
*/
- if (mask && t->on_cpu)
+ if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
cpumask_set_cpu(task_cpu(t), mask);
-#endif
}
}
read_unlock(&tasklist_lock);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 236924930bf06..972ec3bfa9c0c 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -40,11 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
- { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
- { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
- { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 },
- { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
- { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index f2eac41bb4ff5..8ce6d8371cfbf 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -72,6 +72,9 @@ static int sgx_release(struct inode *inode, struct file *file)
synchronize_srcu(&encl->srcu);
mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
kfree(encl_mm);
+
+ /* 'encl_mm' is gone, put encl_mm->encl reference: */
+ kref_put(&encl->refcount, sgx_encl_release);
}
kref_put(&encl->refcount, sgx_encl_release);
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index ee50a50102771..7449ef33f0819 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -141,7 +141,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
struct sgx_encl_page *entry;
unsigned long phys_addr;
struct sgx_encl *encl;
- unsigned long pfn;
vm_fault_t ret;
encl = vma->vm_private_data;
@@ -168,13 +167,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
- /* Check if another thread got here first to insert the PTE. */
- if (!follow_pfn(vma, addr, &pfn)) {
- mutex_unlock(&encl->lock);
-
- return VM_FAULT_NOPAGE;
- }
-
ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
if (ret != VM_FAULT_NOPAGE) {
mutex_unlock(&encl->lock);
@@ -481,6 +473,9 @@ static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
{
struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+ /* 'encl_mm' is going away, put encl_mm->encl reference: */
+ kref_put(&encl_mm->encl->refcount, sgx_encl_release);
+
kfree(encl_mm);
}
@@ -534,6 +529,8 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
if (!encl_mm)
return -ENOMEM;
+ /* Grab a refcount for the encl_mm->encl reference: */
+ kref_get(&encl->refcount);
encl_mm->encl = encl;
encl_mm->mm = mm;
encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index c519fc5f69480..8df81a3ed9457 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -700,25 +700,27 @@ static bool __init sgx_page_cache_init(void)
return true;
}
-static void __init sgx_init(void)
+static int __init sgx_init(void)
{
int ret;
int i;
if (!cpu_feature_enabled(X86_FEATURE_SGX))
- return;
+ return -ENODEV;
if (!sgx_page_cache_init())
- return;
+ return -ENOMEM;
- if (!sgx_page_reclaimer_init())
+ if (!sgx_page_reclaimer_init()) {
+ ret = -ENOMEM;
goto err_page_cache;
+ }
ret = sgx_drv_init();
if (ret)
goto err_kthread;
- return;
+ return 0;
err_kthread:
kthread_stop(ksgxd_tsk);
@@ -728,6 +730,8 @@ err_page_cache:
vfree(sgx_epc_sections[i].pages);
memunmap(sgx_epc_sections[i].virt_addr);
}
+
+ return ret;
}
device_initcall(sgx_init);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 1dd851397bd90..5601b95944fae 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -128,12 +128,21 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
- unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
+ unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
+ unsigned long *begin;
/*
- * This is a software stack, so 'end' can be a valid stack pointer.
- * It just means the stack is empty.
+ * @end points directly to the top most stack entry to avoid a -8
+ * adjustment in the stack switch hotpath. Adjust it back before
+ * calculating @begin.
+ */
+ end++;
+ begin = end - (IRQ_STACK_SIZE / sizeof(long));
+
+ /*
+ * Due to the switching logic RSP can never be == @end because the
+ * final operation is 'popq %rsp' which means after that RSP points
+ * to the original stack and not to @end.
*/
if (stack < begin || stack >= end)
return false;
@@ -143,8 +152,9 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info
info->end = end;
/*
- * The next stack pointer is the first thing pushed by the entry code
- * after switching to the irq stack.
+ * The next stack pointer is stored at the top of the irq stack
+ * before switching to the irq stack. Actual stack entries are all
+ * below that.
*/
info->next_sp = (unsigned long *)*(end - 1);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 5d8047441a0aa..683749b80ae28 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -167,14 +167,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
fx->fop = 0;
fx->rip = 0;
fx->rdp = 0;
- memset(&fx->st_space[0], 0, 128);
+ memset(fx->st_space, 0, sizeof(fx->st_space));
}
/*
* SSE is in init state
*/
if (!(xfeatures & XFEATURE_MASK_SSE))
- memset(&fx->xmm_space[0], 0, 256);
+ memset(fx->xmm_space, 0, sizeof(fx->xmm_space));
/*
* First two features are FPU and SSE, which above we handled
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 0d54099c2a3a3..7c273846c6877 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -184,6 +184,7 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
* It is also used to copy the retq for trampolines.
*/
SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
+ UNWIND_HINT_FUNC
retq
SYM_FUNC_END(ftrace_epilogue)
@@ -276,7 +277,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
restore_mcount_regs 8
/* Restore flags */
popfq
- UNWIND_HINT_RET_OFFSET
+ UNWIND_HINT_FUNC
jmp ftrace_epilogue
SYM_FUNC_END(ftrace_regs_caller)
@@ -333,8 +334,7 @@ SYM_FUNC_START(ftrace_graph_caller)
retq
SYM_FUNC_END(ftrace_graph_caller)
-SYM_CODE_START(return_to_handler)
- UNWIND_HINT_EMPTY
+SYM_FUNC_START(return_to_handler)
subq $24, %rsp
/* Save the return values */
@@ -349,5 +349,5 @@ SYM_CODE_START(return_to_handler)
movq (%rsp), %rax
addq $24, %rsp
JMP_NOSPEC rdi
-SYM_CODE_END(return_to_handler)
+SYM_FUNC_END(return_to_handler)
#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c5dd50369e2f3..58aa712973ac8 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -21,6 +21,7 @@
#include <asm/hw_irq.h>
#include <asm/desc.h>
#include <asm/traps.h>
+#include <asm/thermal.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -227,7 +228,7 @@ static __always_inline void handle_irq(struct irq_desc *desc,
struct pt_regs *regs)
{
if (IS_ENABLED(CONFIG_X86_64))
- run_irq_on_irqstack_cond(desc->handle_irq, desc, regs);
+ generic_handle_irq_desc(desc);
else
__handle_irq(desc, regs);
}
@@ -374,3 +375,23 @@ void fixup_irqs(void)
}
}
#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+static void smp_thermal_vector(void)
+{
+ if (x86_thermal_enabled())
+ intel_thermal_interrupt();
+ else
+ pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+ smp_processor_id());
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
+{
+ trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+ inc_irq_stat(irq_thermal_count);
+ smp_thermal_vector();
+ trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+ ack_APIC_irq();
+}
+#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 0b79efc87be52..044902d5a3c4a 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -22,6 +22,7 @@
#include <asm/apic.h>
#include <asm/nospec-branch.h>
+#include <asm/softirq_stack.h>
#ifdef CONFIG_DEBUG_STACKOVERFLOW
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 440eed558558d..1c0fb96b9e390 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,6 +20,7 @@
#include <linux/sched/task_stack.h>
#include <asm/cpu_entry_area.h>
+#include <asm/softirq_stack.h>
#include <asm/irq_stack.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
@@ -48,7 +49,8 @@ static int map_irq_stack(unsigned int cpu)
if (!va)
return -ENOMEM;
- per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
+ /* Store actual TOS to avoid adjustment in the hotpath */
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#else
@@ -60,7 +62,8 @@ static int map_irq_stack(unsigned int cpu)
{
void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
- per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
+ /* Store actual TOS to avoid adjustment in the hotpath */
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#endif
@@ -71,8 +74,3 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return 0;
return map_irq_stack(cpu);
}
-
-void do_softirq_own_stack(void)
-{
- run_on_irqstack_cond(__do_softirq, NULL);
-}
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 0db0375235b48..8ef35063964b1 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl)
ret
SYM_FUNC_END(native_save_fl)
EXPORT_SYMBOL(native_save_fl)
-
-/*
- * void native_restore_fl(unsigned long flags)
- * %eax/%rdi: flags
- */
-SYM_FUNC_START(native_restore_fl)
- push %_ASM_ARG1
- popf
- ret
-SYM_FUNC_END(native_restore_fl)
-EXPORT_SYMBOL(native_restore_fl)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index a65e9e97857f8..df776cdca327d 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -133,26 +133,6 @@ void synthesize_relcall(void *dest, void *from, void *to)
NOKPROBE_SYMBOL(synthesize_relcall);
/*
- * Skip the prefixes of the instruction.
- */
-static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
-{
- insn_attr_t attr;
-
- attr = inat_get_opcode_attribute((insn_byte_t)*insn);
- while (inat_is_legacy_prefix(attr)) {
- insn++;
- attr = inat_get_opcode_attribute((insn_byte_t)*insn);
- }
-#ifdef CONFIG_X86_64
- if (inat_is_rex_prefix(attr))
- insn++;
-#endif
- return insn;
-}
-NOKPROBE_SYMBOL(skip_prefixes);
-
-/*
* Returns non-zero if INSN is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
@@ -312,25 +292,6 @@ static int can_probe(unsigned long paddr)
}
/*
- * Returns non-zero if opcode modifies the interrupt flag.
- */
-static int is_IF_modifier(kprobe_opcode_t *insn)
-{
- /* Skip prefixes */
- insn = skip_prefixes(insn);
-
- switch (*insn) {
- case 0xfa: /* cli */
- case 0xfb: /* sti */
- case 0xcf: /* iret/iretd */
- case 0x9d: /* popf/popfd */
- return 1;
- }
-
- return 0;
-}
-
-/*
* Copy an instruction with recovering modified instruction by kprobes
* and adjust the displacement if the instruction uses the %rip-relative
* addressing mode. Note that since @real will be the final place of copied
@@ -411,9 +372,9 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
synthesize_reljump(buf + len, p->ainsn.insn + len,
p->addr + insn->length);
len += JMP32_INSN_SIZE;
- p->ainsn.boostable = true;
+ p->ainsn.boostable = 1;
} else {
- p->ainsn.boostable = false;
+ p->ainsn.boostable = 0;
}
return len;
@@ -450,6 +411,67 @@ void free_insn_page(void *page)
module_memfree(page);
}
+static void set_resume_flags(struct kprobe *p, struct insn *insn)
+{
+ insn_byte_t opcode = insn->opcode.bytes[0];
+
+ switch (opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0x9d: /* popf/popfd */
+ /* Check whether the instruction modifies Interrupt Flag or not */
+ p->ainsn.if_modifier = 1;
+ break;
+ case 0x9c: /* pushfl */
+ p->ainsn.is_pushf = 1;
+ break;
+ case 0xcf: /* iret */
+ p->ainsn.if_modifier = 1;
+ fallthrough;
+ case 0xc2: /* ret/lret */
+ case 0xc3:
+ case 0xca:
+ case 0xcb:
+ case 0xea: /* jmp absolute -- ip is correct */
+ /* ip is already adjusted, no more changes required */
+ p->ainsn.is_abs_ip = 1;
+ /* Without resume jump, this is boostable */
+ p->ainsn.boostable = 1;
+ break;
+ case 0xe8: /* call relative - Fix return addr */
+ p->ainsn.is_call = 1;
+ break;
+#ifdef CONFIG_X86_32
+ case 0x9a: /* call absolute -- same as call absolute, indirect */
+ p->ainsn.is_call = 1;
+ p->ainsn.is_abs_ip = 1;
+ break;
+#endif
+ case 0xff:
+ opcode = insn->opcode.bytes[1];
+ if ((opcode & 0x30) == 0x10) {
+ /*
+ * call absolute, indirect
+ * Fix return addr; ip is correct.
+ * But this is not boostable
+ */
+ p->ainsn.is_call = 1;
+ p->ainsn.is_abs_ip = 1;
+ break;
+ } else if (((opcode & 0x31) == 0x20) ||
+ ((opcode & 0x31) == 0x21)) {
+ /*
+ * jmp near and far, absolute indirect
+ * ip is correct.
+ */
+ p->ainsn.is_abs_ip = 1;
+ /* Without resume jump, this is boostable */
+ p->ainsn.boostable = 1;
+ }
+ break;
+ }
+}
+
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
@@ -467,8 +489,8 @@ static int arch_copy_kprobe(struct kprobe *p)
*/
len = prepare_boost(buf, p, &insn);
- /* Check whether the instruction modifies Interrupt Flag or not */
- p->ainsn.if_modifier = is_IF_modifier(buf);
+ /* Analyze the opcode and set resume flags */
+ set_resume_flags(p, &insn);
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
@@ -491,6 +513,9 @@ int arch_prepare_kprobe(struct kprobe *p)
if (!can_probe((unsigned long)p->addr))
return -EILSEQ;
+
+ memset(&p->ainsn, 0, sizeof(p->ainsn));
+
/* insn: must be on special executable page on x86. */
p->ainsn.insn = get_insn_slot();
if (!p->ainsn.insn)
@@ -806,11 +831,6 @@ NOKPROBE_SYMBOL(trampoline_handler);
* 2) If the single-stepped instruction was a call, the return address
* that is atop the stack is the address following the copied instruction.
* We need to make it the address following the original instruction.
- *
- * If this is the first time we've single-stepped the instruction at
- * this probepoint, and the instruction is boostable, boost it: add a
- * jump instruction after the copied instruction, that jumps to the next
- * instruction after the probepoint.
*/
static void resume_execution(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
@@ -818,60 +838,20 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
unsigned long *tos = stack_addr(regs);
unsigned long copy_ip = (unsigned long)p->ainsn.insn;
unsigned long orig_ip = (unsigned long)p->addr;
- kprobe_opcode_t *insn = p->ainsn.insn;
-
- /* Skip prefixes */
- insn = skip_prefixes(insn);
regs->flags &= ~X86_EFLAGS_TF;
- switch (*insn) {
- case 0x9c: /* pushfl */
+
+ /* Fixup the contents of top of stack */
+ if (p->ainsn.is_pushf) {
*tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
*tos |= kcb->kprobe_old_flags;
- break;
- case 0xc2: /* iret/ret/lret */
- case 0xc3:
- case 0xca:
- case 0xcb:
- case 0xcf:
- case 0xea: /* jmp absolute -- ip is correct */
- /* ip is already adjusted, no more changes required */
- p->ainsn.boostable = true;
- goto no_change;
- case 0xe8: /* call relative - Fix return addr */
+ } else if (p->ainsn.is_call) {
*tos = orig_ip + (*tos - copy_ip);
- break;
-#ifdef CONFIG_X86_32
- case 0x9a: /* call absolute -- same as call absolute, indirect */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
-#endif
- case 0xff:
- if ((insn[1] & 0x30) == 0x10) {
- /*
- * call absolute, indirect
- * Fix return addr; ip is correct.
- * But this is not boostable
- */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
- } else if (((insn[1] & 0x31) == 0x20) ||
- ((insn[1] & 0x31) == 0x21)) {
- /*
- * jmp near and far, absolute indirect
- * ip is correct. And this is boostable
- */
- p->ainsn.boostable = true;
- goto no_change;
- }
- break;
- default:
- break;
}
- regs->ip += orig_ip - copy_ip;
+ if (!p->ainsn.is_abs_ip)
+ regs->ip += orig_ip - copy_ip;
-no_change:
restore_btf();
}
NOKPROBE_SYMBOL(resume_execution);
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b8aee71840ae5..aa15132228da5 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -398,9 +398,15 @@ static void free_ldt_pgtables(struct mm_struct *mm)
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
- tlb_gather_mmu(&tlb, mm, start, end);
+ /*
+ * Although free_pgd_range() is intended for freeing user
+ * page-tables, it also works out for kernel mappings on x86.
+ * We use tlb_gather_mmu_fullmm() to avoid confusing the
+ * range-tracking logic in __tlb_adjust_range().
+ */
+ tlb_gather_mmu_fullmm(&tlb, mm);
free_pgd_range(&tlb, start, end, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ tlb_finish_mmu(&tlb);
#endif
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 34b153cbd4acb..5e9a34b5bd741 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -114,6 +114,7 @@ int apply_relocate(Elf32_Shdr *sechdrs,
*location += sym->st_value;
break;
case R_386_PC32:
+ case R_386_PLT32:
/* Add the value, subtract its position */
*location += sym->st_value - (uint32_t)location;
break;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 8a67d1fa8dc58..ed8ac6bcbafb2 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -182,6 +182,13 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
err = security_locked_down(LOCKDOWN_MSR);
if (err)
break;
+
+ err = filter_write(regs[1]);
+ if (err)
+ return err;
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
err = wrmsr_safe_regs_on_cpu(cpu, regs);
if (err)
break;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 6c3407ba6ee98..c60222ab8ab9b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
else if (opfunc == _paravirt_ident_64)
ret = paravirt_patch_ident_64(insn_buff, len);
- else if (type == PARAVIRT_PATCH(cpu.iret) ||
- type == PARAVIRT_PATCH(cpu.usergs_sysret64))
+ else if (type == PARAVIRT_PATCH(cpu.iret))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
#endif
@@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu)
/* These are in entry.S */
extern void native_iret(void);
-extern void native_usergs_sysret64(void);
static struct resource reserve_ioports = {
.start = 0,
@@ -310,9 +308,7 @@ struct paravirt_patch_template pv_ops = {
.cpu.load_sp0 = native_load_sp0,
- .cpu.usergs_sysret64 = native_usergs_sysret64,
.cpu.iret = native_iret,
- .cpu.swapgs = native_swapgs,
#ifdef CONFIG_X86_IOPL_IOPERM
.cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
@@ -324,7 +320,6 @@ struct paravirt_patch_template pv_ops = {
/* Irq ops. */
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.irq.safe_halt = native_safe_halt,
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
index ace6e334cb393..abd27ec67397c 100644
--- a/arch/x86/kernel/paravirt_patch.c
+++ b/arch/x86/kernel/paravirt_patch.c
@@ -25,10 +25,7 @@ struct patch_xxl {
const unsigned char mmu_read_cr2[3];
const unsigned char mmu_read_cr3[3];
const unsigned char mmu_write_cr3[3];
- const unsigned char irq_restore_fl[2];
const unsigned char cpu_wbinvd[2];
- const unsigned char cpu_usergs_sysret64[6];
- const unsigned char cpu_swapgs[3];
const unsigned char mov64[3];
};
@@ -39,11 +36,7 @@ static const struct patch_xxl patch_data_xxl = {
.mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax
.mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax
.mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3
- .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq
.cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd
- .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8,
- 0x48, 0x0f, 0x07 }, // swapgs; sysretq
- .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs
.mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
};
@@ -76,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
switch (type) {
#ifdef CONFIG_PARAVIRT_XXL
- PATCH_CASE(irq, restore_fl, xxl, insn_buff, len);
PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
@@ -85,8 +77,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
- PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len);
- PATCH_CASE(cpu, swapgs, xxl, insn_buff, len);
PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
#endif
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 2e9006c1e2408..42e92ec62973b 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -4,9 +4,6 @@
#include <linux/string.h>
#include <linux/kallsyms.h>
-
-#define DEBUG 1
-
static struct iommu_table_entry * __init
find_dependents_of(struct iommu_table_entry *start,
struct iommu_table_entry *finish,
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad582f9ac5a6f..d08307df69ad5 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -539,7 +539,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
- this_cpu_read(irq_count) != -1);
+ this_cpu_read(hardirq_stack_inuse));
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_prepare(prev_fpu, cpu);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index bedca011459cc..87a4143aa7d7c 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child)
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static const struct user_regset_view user_x86_32_view; /* Initialized below. */
#endif
+#ifdef CONFIG_X86_64
+static const struct user_regset_view user_x86_64_view; /* Initialized below. */
+#endif
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
@@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request,
int ret;
unsigned long __user *datap = (unsigned long __user *)data;
+#ifdef CONFIG_X86_64
+ /* This is native 64-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_64_view;
+#else
+ /* This is native 32-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_32_view;
+#endif
+
switch (request) {
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
@@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request,
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child,
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
}
+/*
+ * This is used by the core dump code to decide which regset to dump. The
+ * core dump code writes out the resulting .e_machine and the corresponding
+ * regsets. This is suboptimal if the task is messing around with its CS.L
+ * field, but at worst the core dump will end up missing some information.
+ *
+ * Unfortunately, it is also used by the broken PTRACE_GETREGSET and
+ * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have
+ * no way to make sure that the e_machine they use matches the caller's
+ * expectations. The result is that the data format returned by
+ * PTRACE_GETREGSET depends on the returned CS field (and even the offset
+ * of the returned CS field depends on its value!) and the data format
+ * accepted by PTRACE_SETREGSET is determined by the old CS value. The
+ * upshot is that it is basically impossible to use these APIs correctly.
+ *
+ * The best way to fix it in the long run would probably be to add new
+ * improved ptrace() APIs to read and write registers reliably, possibly by
+ * allowing userspace to select the ELF e_machine variant that they expect.
+ */
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index db115943e8bdc..9991c5920aace 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -477,6 +477,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
},
},
+ { /* PCIe Wifi card isn't detected after reboot otherwise */
+ .callback = set_pci_reboot,
+ .ident = "Zotac ZBOX CI327 nano",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NA"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"),
+ },
+ },
+
/* Sony */
{ /* Handle problems with rebooting on Sony VGN-Z540N */
.callback = set_bios_reboot,
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index ca9a380d9c0b3..9442c4136c387 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -11,14 +11,26 @@ enum insn_type {
RET = 3, /* tramp / site cond-tail-call */
};
+/*
+ * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax
+ * The REX.W cancels the effect of any data16.
+ */
+static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
+ const void *emulate = NULL;
int size = CALL_INSN_SIZE;
const void *code;
switch (type) {
case CALL:
code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+ if (func == &__static_call_return0) {
+ emulate = code;
+ code = &xor5rax;
+ }
+
break;
case NOP:
@@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
if (unlikely(system_state == SYSTEM_BOOTING))
return text_poke_early(insn, code, size);
- text_poke_bp(insn, code, size, NULL);
+ text_poke_bp(insn, code, size, emulate);
}
static void __static_call_validate(void *insn, bool tail)
@@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail)
return;
} else {
if (opcode == CALL_INSN_OPCODE ||
- !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5))
+ !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) ||
+ !memcmp(insn, xor5rax, 5))
return;
}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 504fa5425bcec..660b78827638f 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
{
- long error;
- error = -EINVAL;
if (off & ~PAGE_MASK)
- goto out;
+ return -EINVAL;
- error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
-out:
- return error;
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}
static void find_start_end(unsigned long addr, unsigned long flags,
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 73f8001000669..2a1d47f47eee2 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -471,7 +471,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_REG_SP_INDIRECT:
- sp = state->sp + orc->sp_offset;
+ sp = state->sp;
indirect = true;
break;
@@ -521,6 +521,9 @@ bool unwind_next_frame(struct unwind_state *state)
if (indirect) {
if (!deref_stack_reg(state, sp, &sp))
goto err;
+
+ if (orc->sp_reg == ORC_REG_SP_INDIRECT)
+ sp += orc->sp_offset;
}
/* Find IP, SP and possibly regs: */
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 764573de3996d..e5a7a10a0164d 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
unsafe_put_user(regs->ds, &user->regs.ds, Efault_end);
unsafe_put_user(regs->fs, &user->regs.fs, Efault_end);
unsafe_put_user(regs->gs, &user->regs.gs, Efault_end);
- unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end);
+
+ /*
+ * Don't write screen_bitmap in case some user had a value there
+ * and expected it to remain unchanged.
+ */
user_access_end();
@@ -160,49 +164,6 @@ Efault:
do_exit(SIGSEGV);
}
-static void mark_screen_rdonly(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
- spinlock_t *ptl;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int i;
-
- mmap_write_lock(mm);
- pgd = pgd_offset(mm, 0xA0000);
- if (pgd_none_or_clear_bad(pgd))
- goto out;
- p4d = p4d_offset(pgd, 0xA0000);
- if (p4d_none_or_clear_bad(p4d))
- goto out;
- pud = pud_offset(p4d, 0xA0000);
- if (pud_none_or_clear_bad(pud))
- goto out;
- pmd = pmd_offset(pud, 0xA0000);
-
- if (pmd_trans_huge(*pmd)) {
- vma = find_vma(mm, 0xA0000);
- split_huge_pmd(vma, pmd, 0xA0000);
- }
- if (pmd_none_or_clear_bad(pmd))
- goto out;
- pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
- for (i = 0; i < 32; i++) {
- if (pte_present(*pte))
- set_pte(pte, pte_wrprotect(*pte));
- pte++;
- }
- pte_unmap_unlock(pte, ptl);
-out:
- mmap_write_unlock(mm);
- flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
-}
-
-
-
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
@@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
offsetof(struct vm86_struct, int_revectored)))
return -EFAULT;
+
+ /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
+ if (v.flags & VM86_SCREEN_BITMAP) {
+ char comm[TASK_COMM_LEN];
+
+ pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current));
+ return -EINVAL;
+ }
+
memset(&vm86regs, 0, sizeof(vm86regs));
vm86regs.pt.bx = v.regs.ebx;
@@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86regs.gs = v.regs.gs;
vm86->flags = v.flags;
- vm86->screen_bitmap = v.screen_bitmap;
vm86->cpu_type = v.cpu_type;
if (copy_from_user(&vm86->int_revectored,
@@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
update_task_stack(tsk);
preempt_enable();
- if (vm86->flags & VM86_SCREEN_BITMAP)
- mark_screen_rdonly(tsk->mm);
-
memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
return regs->ax;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b404e4d7dd8e..b967c1c774a1f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1782,6 +1782,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
+ xfer_to_guest_mode_prepare();
return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
xfer_to_guest_mode_work_pending();
}
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 404279563891a..435630a6ec970 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -5,6 +5,7 @@
* Copyright (C) IBM Corporation, 2002, 2004, 2009
*/
+#include <linux/kernel.h>
#ifdef __KERNEL__
#include <linux/string.h>
#else
@@ -15,15 +16,28 @@
#include <asm/emulate_prefix.h>
+#define leXX_to_cpu(t, r) \
+({ \
+ __typeof__(t) v; \
+ switch (sizeof(t)) { \
+ case 4: v = le32_to_cpu(r); break; \
+ case 2: v = le16_to_cpu(r); break; \
+ case 1: v = r; break; \
+ default: \
+ BUILD_BUG(); break; \
+ } \
+ v; \
+})
+
/* Verify next sizeof(t) bytes can be on the same instruction */
#define validate_next(t, insn, n) \
((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr)
#define __get_next(t, insn) \
- ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+ ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); })
#define __peek_nbyte_next(t, insn, n) \
- ({ t r = *(t*)((insn)->next_byte + n); r; })
+ ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); })
#define get_next(t, insn) \
({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
@@ -147,9 +161,9 @@ found:
b = insn->prefixes.bytes[3];
for (i = 0; i < nb; i++)
if (prefixes->bytes[i] == lb)
- prefixes->bytes[i] = b;
+ insn_set_byte(prefixes, i, b);
}
- insn->prefixes.bytes[3] = lb;
+ insn_set_byte(&insn->prefixes, 3, lb);
}
/* Decode REX prefix */
@@ -157,8 +171,7 @@ found:
b = peek_next(insn_byte_t, insn);
attr = inat_get_opcode_attribute(b);
if (inat_is_rex_prefix(attr)) {
- insn->rex_prefix.value = b;
- insn->rex_prefix.nbytes = 1;
+ insn_field_set(&insn->rex_prefix, b, 1);
insn->next_byte++;
if (X86_REX_W(b))
/* REX.W overrides opnd_size */
@@ -181,13 +194,13 @@ found:
if (X86_MODRM_MOD(b2) != 3)
goto vex_end;
}
- insn->vex_prefix.bytes[0] = b;
- insn->vex_prefix.bytes[1] = b2;
+ insn_set_byte(&insn->vex_prefix, 0, b);
+ insn_set_byte(&insn->vex_prefix, 1, b2);
if (inat_is_evex_prefix(attr)) {
b2 = peek_nbyte_next(insn_byte_t, insn, 2);
- insn->vex_prefix.bytes[2] = b2;
+ insn_set_byte(&insn->vex_prefix, 2, b2);
b2 = peek_nbyte_next(insn_byte_t, insn, 3);
- insn->vex_prefix.bytes[3] = b2;
+ insn_set_byte(&insn->vex_prefix, 3, b2);
insn->vex_prefix.nbytes = 4;
insn->next_byte += 4;
if (insn->x86_64 && X86_VEX_W(b2))
@@ -195,7 +208,7 @@ found:
insn->opnd_bytes = 8;
} else if (inat_is_vex3_prefix(attr)) {
b2 = peek_nbyte_next(insn_byte_t, insn, 2);
- insn->vex_prefix.bytes[2] = b2;
+ insn_set_byte(&insn->vex_prefix, 2, b2);
insn->vex_prefix.nbytes = 3;
insn->next_byte += 3;
if (insn->x86_64 && X86_VEX_W(b2))
@@ -207,7 +220,7 @@ found:
* Makes it easier to decode vex.W, vex.vvvv,
* vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
*/
- insn->vex_prefix.bytes[2] = b2 & 0x7f;
+ insn_set_byte(&insn->vex_prefix, 2, b2 & 0x7f);
insn->vex_prefix.nbytes = 2;
insn->next_byte += 2;
}
@@ -243,7 +256,7 @@ void insn_get_opcode(struct insn *insn)
/* Get first opcode */
op = get_next(insn_byte_t, insn);
- opcode->bytes[0] = op;
+ insn_set_byte(opcode, 0, op);
opcode->nbytes = 1;
/* Check if there is VEX prefix or not */
@@ -295,8 +308,7 @@ void insn_get_modrm(struct insn *insn)
if (inat_has_modrm(insn->attr)) {
mod = get_next(insn_byte_t, insn);
- modrm->value = mod;
- modrm->nbytes = 1;
+ insn_field_set(modrm, mod, 1);
if (inat_is_group(insn->attr)) {
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_group_attribute(mod, pfx_id,
@@ -334,7 +346,7 @@ int insn_rip_relative(struct insn *insn)
* For rip-relative instructions, the mod field (top 2 bits)
* is zero and the r/m field (bottom 3 bits) is 0x5.
*/
- return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+ return (modrm->nbytes && (modrm->bytes[0] & 0xc7) == 0x5);
}
/**
@@ -353,11 +365,11 @@ void insn_get_sib(struct insn *insn)
if (!insn->modrm.got)
insn_get_modrm(insn);
if (insn->modrm.nbytes) {
- modrm = (insn_byte_t)insn->modrm.value;
+ modrm = insn->modrm.bytes[0];
if (insn->addr_bytes != 2 &&
X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
- insn->sib.value = get_next(insn_byte_t, insn);
- insn->sib.nbytes = 1;
+ insn_field_set(&insn->sib,
+ get_next(insn_byte_t, insn), 1);
}
}
insn->sib.got = 1;
@@ -407,19 +419,18 @@ void insn_get_displacement(struct insn *insn)
if (mod == 3)
goto out;
if (mod == 1) {
- insn->displacement.value = get_next(signed char, insn);
- insn->displacement.nbytes = 1;
+ insn_field_set(&insn->displacement,
+ get_next(signed char, insn), 1);
} else if (insn->addr_bytes == 2) {
if ((mod == 0 && rm == 6) || mod == 2) {
- insn->displacement.value =
- get_next(short, insn);
- insn->displacement.nbytes = 2;
+ insn_field_set(&insn->displacement,
+ get_next(short, insn), 2);
}
} else {
if ((mod == 0 && rm == 5) || mod == 2 ||
(mod == 0 && base == 5)) {
- insn->displacement.value = get_next(int, insn);
- insn->displacement.nbytes = 4;
+ insn_field_set(&insn->displacement,
+ get_next(int, insn), 4);
}
}
}
@@ -435,18 +446,14 @@ static int __get_moffset(struct insn *insn)
{
switch (insn->addr_bytes) {
case 2:
- insn->moffset1.value = get_next(short, insn);
- insn->moffset1.nbytes = 2;
+ insn_field_set(&insn->moffset1, get_next(short, insn), 2);
break;
case 4:
- insn->moffset1.value = get_next(int, insn);
- insn->moffset1.nbytes = 4;
+ insn_field_set(&insn->moffset1, get_next(int, insn), 4);
break;
case 8:
- insn->moffset1.value = get_next(int, insn);
- insn->moffset1.nbytes = 4;
- insn->moffset2.value = get_next(int, insn);
- insn->moffset2.nbytes = 4;
+ insn_field_set(&insn->moffset1, get_next(int, insn), 4);
+ insn_field_set(&insn->moffset2, get_next(int, insn), 4);
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
@@ -464,13 +471,11 @@ static int __get_immv32(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
- insn->immediate.value = get_next(short, insn);
- insn->immediate.nbytes = 2;
+ insn_field_set(&insn->immediate, get_next(short, insn), 2);
break;
case 4:
case 8:
- insn->immediate.value = get_next(int, insn);
- insn->immediate.nbytes = 4;
+ insn_field_set(&insn->immediate, get_next(int, insn), 4);
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
@@ -487,18 +492,15 @@ static int __get_immv(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
- insn->immediate1.value = get_next(short, insn);
- insn->immediate1.nbytes = 2;
+ insn_field_set(&insn->immediate1, get_next(short, insn), 2);
break;
case 4:
- insn->immediate1.value = get_next(int, insn);
- insn->immediate1.nbytes = 4;
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
break;
case 8:
- insn->immediate1.value = get_next(int, insn);
- insn->immediate1.nbytes = 4;
- insn->immediate2.value = get_next(int, insn);
- insn->immediate2.nbytes = 4;
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
+ insn_field_set(&insn->immediate2, get_next(int, insn), 4);
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
@@ -515,12 +517,10 @@ static int __get_immptr(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
- insn->immediate1.value = get_next(short, insn);
- insn->immediate1.nbytes = 2;
+ insn_field_set(&insn->immediate1, get_next(short, insn), 2);
break;
case 4:
- insn->immediate1.value = get_next(int, insn);
- insn->immediate1.nbytes = 4;
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
break;
case 8:
/* ptr16:64 is not exist (no segment) */
@@ -528,8 +528,7 @@ static int __get_immptr(struct insn *insn)
default: /* opnd_bytes must be modified manually */
goto err_out;
}
- insn->immediate2.value = get_next(unsigned short, insn);
- insn->immediate2.nbytes = 2;
+ insn_field_set(&insn->immediate2, get_next(unsigned short, insn), 2);
insn->immediate1.got = insn->immediate2.got = 1;
return 1;
@@ -565,22 +564,17 @@ void insn_get_immediate(struct insn *insn)
switch (inat_immediate_size(insn->attr)) {
case INAT_IMM_BYTE:
- insn->immediate.value = get_next(signed char, insn);
- insn->immediate.nbytes = 1;
+ insn_field_set(&insn->immediate, get_next(signed char, insn), 1);
break;
case INAT_IMM_WORD:
- insn->immediate.value = get_next(short, insn);
- insn->immediate.nbytes = 2;
+ insn_field_set(&insn->immediate, get_next(short, insn), 2);
break;
case INAT_IMM_DWORD:
- insn->immediate.value = get_next(int, insn);
- insn->immediate.nbytes = 4;
+ insn_field_set(&insn->immediate, get_next(int, insn), 4);
break;
case INAT_IMM_QWORD:
- insn->immediate1.value = get_next(int, insn);
- insn->immediate1.nbytes = 4;
- insn->immediate2.value = get_next(int, insn);
- insn->immediate2.nbytes = 4;
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
+ insn_field_set(&insn->immediate2, get_next(int, insn), 4);
break;
case INAT_IMM_PTR:
if (!__get_immptr(insn))
@@ -599,8 +593,7 @@ void insn_get_immediate(struct insn *insn)
goto err_out;
}
if (inat_has_second_immediate(insn->attr)) {
- insn->immediate2.value = get_next(signed char, insn);
- insn->immediate2.nbytes = 1;
+ insn_field_set(&insn->immediate2, get_next(signed char, insn), 1);
}
done:
insn->immediate.got = 1;
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index b4c43a9b14836..f6fb1d218dccf 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -28,7 +28,7 @@ SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg)
jmp .Lspec_trap_\@
.Ldo_rop_\@:
mov %\reg, (%_ASM_SP)
- UNWIND_HINT_RET_OFFSET
+ UNWIND_HINT_FUNC
ret
SYM_FUNC_END(__x86_retpoline_\reg)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f1f1b5a0956a0..525197381baa8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -16,7 +16,7 @@
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
-#include <linux/efi.h> /* efi_recover_from_page_fault()*/
+#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
@@ -25,7 +25,7 @@
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
-#include <asm/efi.h> /* efi_recover_from_page_fault()*/
+#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
@@ -54,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
* 32-bit mode:
*
* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
+ * Check that here and ignore it. This is AMD erratum #91.
*
* 64-bit mode:
*
@@ -83,11 +83,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
#ifdef CONFIG_X86_64
case 0x40:
/*
- * In AMD64 long mode 0x40..0x4F are valid REX prefixes
- * Need to figure out under what instruction mode the
- * instruction was issued. Could check the LDT for lm,
- * but for now it's good enough to assume that long
- * mode only uses well known segments or kernel.
+ * In 64-bit mode 0x40..0x4F are valid REX prefixes
*/
return (!user_mode(regs) || user_64bit_mode(regs));
#endif
@@ -110,6 +106,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
}
}
+static bool is_amd_k8_pre_npt(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
+ c->x86_vendor == X86_VENDOR_AMD &&
+ c->x86 == 0xf && c->x86_model < 0x40);
+}
+
static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
@@ -117,6 +122,10 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
unsigned char *instr;
int prefetch = 0;
+ /* Erratum #91 affects AMD K8, pre-NPT CPUs */
+ if (!is_amd_k8_pre_npt())
+ return 0;
+
/*
* If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
@@ -127,20 +136,31 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
- return 0;
+ /*
+ * This code has historically always bailed out if IP points to a
+ * not-present page (e.g. due to a race). No one has ever
+ * complained about this.
+ */
+ pagefault_disable();
while (instr < max_instr) {
unsigned char opcode;
- if (get_kernel_nofault(opcode, instr))
- break;
+ if (user_mode(regs)) {
+ if (get_user(opcode, instr))
+ break;
+ } else {
+ if (get_kernel_nofault(opcode, instr))
+ break;
+ }
instr++;
if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
}
+
+ pagefault_enable();
return prefetch;
}
@@ -262,25 +282,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
}
}
-/*
- * Did it hit the DOS screen memory VA from vm86 mode?
- */
-static inline void
-check_v8086_mode(struct pt_regs *regs, unsigned long address,
- struct task_struct *tsk)
-{
-#ifdef CONFIG_VM86
- unsigned long bit;
-
- if (!v8086_mode(regs) || !tsk->thread.vm86)
- return;
-
- bit = (address - 0xA0000) >> PAGE_SHIFT;
- if (bit < 32)
- tsk->thread.vm86->screen_bitmap |= 1 << bit;
-#endif
-}
-
static bool low_pfn(unsigned long pfn)
{
return pfn < max_low_pfn;
@@ -335,15 +336,6 @@ KERN_ERR
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif
-/*
- * No vm86 mode in 64-bit mode:
- */
-static inline void
-check_v8086_mode(struct pt_regs *regs, unsigned long address,
- struct task_struct *tsk)
-{
-}
-
static int bad_address(void *p)
{
unsigned long dummy;
@@ -427,6 +419,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
|| boot_cpu_data.x86 != 0xf)
return 0;
+ if (user_mode(regs))
+ return 0;
+
if (address != regs->ip)
return 0;
@@ -462,10 +457,12 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
}
/* Pentium F0 0F C7 C8 bug workaround: */
-static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
- if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
+ if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
+ idt_is_f00f_address(address)) {
handle_invalid_op(regs);
return 1;
}
@@ -630,53 +627,20 @@ static void set_signal_archinfo(unsigned long address,
}
static noinline void
-no_context(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int signal, int si_code)
+page_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
- struct task_struct *tsk = current;
unsigned long flags;
int sig;
if (user_mode(regs)) {
/*
- * This is an implicit supervisor-mode access from user
- * mode. Bypass all the kernel-mode recovery code and just
- * OOPS.
+ * Implicit kernel access from user mode? Skip the stack
+ * overflow and EFI special cases.
*/
goto oops;
}
- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
- return;
- }
-
#ifdef CONFIG_VMAP_STACK
/*
* Stack overflow? During boot, we can fault near the initial
@@ -684,8 +648,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
* that we're in vmalloc space to avoid this.
*/
if (is_vmalloc_addr((void *)address) &&
- (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
- address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+ (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
+ address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
/*
* We're likely to be running with very little stack space
@@ -709,28 +673,12 @@ no_context(struct pt_regs *regs, unsigned long error_code,
#endif
/*
- * 32-bit:
- *
- * Valid to do another page fault here, because if this fault
- * had been triggered by is_prefetch fixup_exception would have
- * handled it.
- *
- * 64-bit:
- *
- * Hall of shame of CPU/BIOS bugs.
- */
- if (is_prefetch(regs, error_code, address))
- return;
-
- if (is_errata93(regs, address))
- return;
-
- /*
- * Buggy firmware could access regions which might page fault, try to
- * recover from such faults.
+ * Buggy firmware could access regions which might page fault. If
+ * this happens, EFI has a special OOPS path that will try to
+ * avoid hanging the system.
*/
if (IS_ENABLED(CONFIG_EFI))
- efi_recover_from_page_fault(address);
+ efi_crash_gracefully_on_page_fault(address);
oops:
/*
@@ -741,7 +689,7 @@ oops:
show_fault_oops(regs, error_code, address);
- if (task_stack_end_corrupted(tsk))
+ if (task_stack_end_corrupted(current))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
sig = SIGKILL;
@@ -754,6 +702,53 @@ oops:
oops_end(flags, regs, sig);
}
+static noinline void
+kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int signal, int si_code)
+{
+ WARN_ON_ONCE(user_mode(regs));
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+ * the below recursive fault logic only apply to faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
+ if (current->thread.sig_on_uaccess_err && signal) {
+ sanitize_error_code(address, &error_code);
+
+ set_signal_archinfo(address, error_code);
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_fault(signal, si_code, (void __user *)address);
+ }
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
+ return;
+ }
+
+ /*
+ * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
+ * instruction.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ page_fault_oops(regs, error_code, address);
+}
+
/*
* Print out info about fatal segfaults, if the show_unhandled_signals
* sysctl is set:
@@ -796,47 +791,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
{
struct task_struct *tsk = current;
- /* User mode accesses just cause a SIGSEGV */
- if (user_mode(regs) && (error_code & X86_PF_USER)) {
- /*
- * It's possible to have interrupts off here:
- */
- local_irq_enable();
-
- /*
- * Valid to do another page fault here because this one came
- * from user space:
- */
- if (is_prefetch(regs, error_code, address))
- return;
+ if (!user_mode(regs)) {
+ /*
+ * The fourth argument is the signal that may be forced after a
+ * fixed-up kernel-mode fault; pass SIGSEGV, not the pkey.
+ */
+ kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, si_code);
+ return;
+ }
- if (is_errata100(regs, address))
- return;
+ if (!(error_code & X86_PF_USER)) {
+ /* Implicit user access to kernel memory -- just oops */
+ page_fault_oops(regs, error_code, address);
+ return;
+ }
- sanitize_error_code(address, &error_code);
+ /*
+ * User mode accesses just cause a SIGSEGV.
+ * It's possible to have interrupts off here:
+ */
+ local_irq_enable();
- if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
- return;
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space:
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
- if (likely(show_unhandled_signals))
- show_signal_msg(regs, error_code, address, tsk);
+ if (is_errata100(regs, address))
+ return;
- set_signal_archinfo(address, error_code);
+ sanitize_error_code(address, &error_code);
- if (si_code == SEGV_PKUERR)
- force_sig_pkuerr((void __user *)address, pkey);
+ if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
+ return;
- force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ if (likely(show_unhandled_signals))
+ show_signal_msg(regs, error_code, address, tsk);
- local_irq_disable();
+ set_signal_archinfo(address, error_code);
- return;
- }
+ if (si_code == SEGV_PKUERR)
+ force_sig_pkuerr((void __user *)address, pkey);
- if (is_f00f_bug(regs, address))
- return;
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address);
- no_context(regs, error_code, address, SIGSEGV, si_code);
+ local_irq_disable();
}
static noinline void
@@ -926,8 +923,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
vm_fault_t fault)
{
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -961,40 +958,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}
-static noinline void
-mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, vm_fault_t fault)
-{
- if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address, 0, 0);
- return;
- }
-
- if (fault & VM_FAULT_OOM) {
- /* Kernel mode? Handle exceptions or die: */
- if (!(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address,
- SIGSEGV, SEGV_MAPERR);
- return;
- }
-
- /*
- * We ran out of memory, call the OOM killer, and return the
- * userspace (which will retry the fault, or kill us if we got
- * oom-killed):
- */
- pagefault_out_of_memory();
- } else {
- if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
- VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, fault);
- else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address);
- else
- BUG();
- }
-}
-
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
@@ -1209,6 +1172,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
}
#endif
+ if (is_f00f_bug(regs, hw_error_code, address))
+ return;
+
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
@@ -1229,10 +1195,17 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
}
NOKPROBE_SYMBOL(do_kern_addr_fault);
-/* Handle faults in the user portion of the address space */
+/*
+ * Handle faults in the user portion of the address space. Nothing in here
+ * should check X86_PF_USER without a specific justification: for almost
+ * all purposes, we should treat a normal kernel access to user memory
+ * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
+ * The one exception is AC flag handling, which is, per the x86
+ * architecture, special for WRUSS.
+ */
static inline
void do_user_addr_fault(struct pt_regs *regs,
- unsigned long hw_error_code,
+ unsigned long error_code,
unsigned long address)
{
struct vm_area_struct *vma;
@@ -1244,6 +1217,21 @@ void do_user_addr_fault(struct pt_regs *regs,
tsk = current;
mm = tsk->mm;
+ if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
+ /*
+ * Whoops, this is kernel mode code trying to execute from
+ * user memory. Unless this is AMD erratum #93, which
+ * corrupts RIP such that it looks like a user address,
+ * this is unrecoverable. Don't even try to look up the
+ * VMA or look for extable entries.
+ */
+ if (is_errata93(regs, address))
+ return;
+
+ page_fault_oops(regs, error_code, address);
+ return;
+ }
+
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
@@ -1252,8 +1240,8 @@ void do_user_addr_fault(struct pt_regs *regs,
* Reserved bits are never expected to be set on
* entries in the user portion of the page tables.
*/
- if (unlikely(hw_error_code & X86_PF_RSVD))
- pgtable_bad(regs, hw_error_code, address);
+ if (unlikely(error_code & X86_PF_RSVD))
+ pgtable_bad(regs, error_code, address);
/*
* If SMAP is on, check for invalid kernel (supervisor) access to user
@@ -1263,10 +1251,13 @@ void do_user_addr_fault(struct pt_regs *regs,
* enforcement appears to be consistent with the USER bit.
*/
if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
- !(hw_error_code & X86_PF_USER) &&
- !(regs->flags & X86_EFLAGS_AC)))
- {
- bad_area_nosemaphore(regs, hw_error_code, address);
+ !(error_code & X86_PF_USER) &&
+ !(regs->flags & X86_EFLAGS_AC))) {
+ /*
+ * No extable entry here. This was a kernel access to an
+ * invalid pointer. get_kernel_nofault() will not get here.
+ */
+ page_fault_oops(regs, error_code, address);
return;
}
@@ -1275,7 +1266,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
- bad_area_nosemaphore(regs, hw_error_code, address);
+ bad_area_nosemaphore(regs, error_code, address);
return;
}
@@ -1296,9 +1287,9 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (hw_error_code & X86_PF_WRITE)
+ if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (hw_error_code & X86_PF_INSTR)
+ if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
@@ -1314,7 +1305,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* to consider the PF_PK bit.
*/
if (is_vsyscall_vaddr(address)) {
- if (emulate_vsyscall(hw_error_code, regs, address))
+ if (emulate_vsyscall(error_code, regs, address))
return;
}
#endif
@@ -1337,7 +1328,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* Fault from code in kernel from
* which we do not expect faults.
*/
- bad_area_nosemaphore(regs, hw_error_code, address);
+ bad_area_nosemaphore(regs, error_code, address);
return;
}
retry:
@@ -1353,17 +1344,17 @@ retry:
vma = find_vma(mm, address);
if (unlikely(!vma)) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
if (unlikely(expand_stack(vma, address))) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
@@ -1372,8 +1363,8 @@ retry:
* we can handle it..
*/
good_area:
- if (unlikely(access_error(hw_error_code, vma))) {
- bad_area_access_error(regs, hw_error_code, address, vma);
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address, vma);
return;
}
@@ -1392,11 +1383,14 @@ good_area:
*/
fault = handle_mm_fault(vma, address, flags, regs);
- /* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
+ /*
+ * Quick path to respond to signals. The core mm code
+ * has unlocked the mm for us if we get here.
+ */
if (!user_mode(regs))
- no_context(regs, hw_error_code, address, SIGBUS,
- BUS_ADRERR);
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGBUS, BUS_ADRERR);
return;
}
@@ -1412,12 +1406,37 @@ good_area:
}
mmap_read_unlock(mm);
- if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, hw_error_code, address, fault);
+ if (likely(!(fault & VM_FAULT_ERROR)))
+ return;
+
+ if (fatal_signal_pending(current) && !user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
return;
}
- check_v8086_mode(regs, address, tsk);
+ if (fault & VM_FAULT_OOM) {
+ /* Kernel mode? Handle exceptions or die: */
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGSEGV, SEGV_MAPERR);
+ return;
+ }
+
+ /*
+ * We ran out of memory, call the OOM killer, and return to
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
+ } else {
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
+ do_sigbus(regs, error_code, address, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+ bad_area_nosemaphore(regs, error_code, address);
+ else
+ BUG();
+ }
}
NOKPROBE_SYMBOL(do_user_addr_fault);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index e26f5c5c6565a..dd694fb939169 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -157,16 +157,25 @@ __ref void *alloc_low_pages(unsigned int num)
}
/*
- * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
- * With KASLR memory randomization, depending on the machine e820 memory
- * and the PUD alignment. We may need twice more pages when KASLR memory
+ * By default need to be able to allocate page tables below PGD firstly for
+ * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping.
+ * With KASLR memory randomization, depending on the machine e820 memory and the
+ * PUD alignment, twice that many pages may be needed when KASLR memory
* randomization is enabled.
*/
+
+#ifndef CONFIG_X86_5LEVEL
+#define INIT_PGD_PAGE_TABLES 3
+#else
+#define INIT_PGD_PAGE_TABLES 4
+#endif
+
#ifndef CONFIG_RANDOMIZE_MEMORY
-#define INIT_PGD_PAGE_COUNT 6
+#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES)
#else
-#define INIT_PGD_PAGE_COUNT 12
+#define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES)
#endif
+
#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void)
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index c3d5f0236f353..4b01f7dbaf303 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -475,9 +475,10 @@ void __init mem_encrypt_init(void)
swiotlb_update_mem_attributes();
/*
- * With SEV, we need to unroll the rep string I/O instructions.
+ * With SEV, we need to unroll the rep string I/O instructions,
+ * but SEV-ES supports them through the #VC handler.
*/
- if (sev_active())
+ if (sev_active() && !sev_es_active())
static_branch_enable(&sev_enable_key);
print_mem_encrypt_feature_info();
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index bd7aff5c51f77..cd768dafca9e9 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -10,8 +10,6 @@
#define pr_fmt(fmt) "mmiotrace: " fmt
-#define DEBUG 1
-
#include <linux/moduleparam.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index 00bfa1ebad6c7..0bb3b8b44e4e2 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -9,16 +9,23 @@
in the right sequence from here. */
static __init int pci_arch_init(void)
{
- int type;
-
- x86_create_pci_msi_domain();
+ int type, pcbios = 1;
type = pci_direct_probe();
if (!(pci_probe & PCI_PROBE_NOEARLY))
pci_mmcfg_early_init();
- if (x86_init.pci.arch_init && !x86_init.pci.arch_init())
+ if (x86_init.pci.arch_init)
+ pcbios = x86_init.pci.arch_init();
+
+ /*
+ * Must happen after x86_init.pci.arch_init(). Xen sets up the
+ * x86_init.irqs.create_pci_msi_domain there.
+ */
+ x86_create_pci_msi_domain();
+
+ if (!pcbios)
return 0;
pci_pcbios_init();
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index d0e835470d01a..b2f90a1a89f10 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -4,7 +4,6 @@ obj-y += atom/
obj-y += ce4100/
obj-y += efi/
obj-y += geode/
-obj-y += goldfish/
obj-y += iris/
obj-y += intel/
obj-y += intel-mid/
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8efd003540cae..1b82d77019b17 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -54,10 +54,7 @@
* 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
*/
static u64 efi_va = EFI_VA_START;
-
-struct efi_scratch efi_scratch;
-
-EXPORT_SYMBOL_GPL(efi_mm);
+static struct mm_struct *efi_prev_mm;
/*
* We need our own copy of the higher levels of the page tables
@@ -237,7 +234,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
return 1;
}
- efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */
+ efi_mixed_mode_stack_pa = page_to_phys(page + 1); /* stack grows down */
npages = (_etext - _text) >> PAGE_SHIFT;
text = __pa(_text);
@@ -462,11 +459,17 @@ void __init efi_dump_pagetable(void)
* can not change under us.
* It should be ensured that there are no concurrent calls to this function.
*/
-void efi_switch_mm(struct mm_struct *mm)
+void efi_enter_mm(void)
+{
+ efi_prev_mm = current->active_mm;
+ current->active_mm = &efi_mm;
+ switch_mm(efi_prev_mm, &efi_mm, NULL);
+}
+
+void efi_leave_mm(void)
{
- efi_scratch.prev_mm = current->active_mm;
- current->active_mm = mm;
- switch_mm(efi_scratch.prev_mm, mm, NULL);
+ current->active_mm = efi_prev_mm;
+ switch_mm(&efi_mm, efi_prev_mm, NULL);
}
static DEFINE_SPINLOCK(efi_runtime_lock);
@@ -530,12 +533,12 @@ efi_thunk_set_virtual_address_map(unsigned long memory_map_size,
efi_sync_low_kernel_mappings();
local_irq_save(flags);
- efi_switch_mm(&efi_mm);
+ efi_enter_mm();
status = __efi_thunk(set_virtual_address_map, memory_map_size,
descriptor_size, descriptor_version, virtual_map);
- efi_switch_mm(efi_scratch.prev_mm);
+ efi_leave_mm();
local_irq_restore(flags);
return status;
@@ -829,9 +832,9 @@ efi_set_virtual_address_map(unsigned long memory_map_size,
descriptor_size,
descriptor_version,
virtual_map);
- efi_switch_mm(&efi_mm);
+ efi_enter_mm();
- kernel_fpu_begin();
+ efi_fpu_begin();
/* Disable interrupts around EFI calls: */
local_irq_save(flags);
@@ -840,12 +843,12 @@ efi_set_virtual_address_map(unsigned long memory_map_size,
descriptor_version, virtual_map);
local_irq_restore(flags);
- kernel_fpu_end();
+ efi_fpu_end();
/* grab the virtually remapped EFI runtime services table pointer */
efi.runtime = READ_ONCE(systab->runtime);
- efi_switch_mm(efi_scratch.prev_mm);
+ efi_leave_mm();
return status;
}
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index 26f0da238c1ca..fd3dd1708eba5 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -33,7 +33,7 @@ SYM_CODE_START(__efi64_thunk)
* Switch to 1:1 mapped 32-bit stack pointer.
*/
movq %rsp, %rax
- movq efi_scratch(%rip), %rsp
+ movq efi_mixed_mode_stack_pa(%rip), %rsp
push %rax
/*
@@ -70,3 +70,7 @@ SYM_CODE_START(__efi64_thunk)
pushl %ebp
lret
SYM_CODE_END(__efi64_thunk)
+
+ .bss
+ .balign 8
+SYM_DATA(efi_mixed_mode_stack_pa, .quad 0)
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 5a40fe411ebda..67d93a243c353 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
* @return: Returns, if the page fault is not handled. This function
* will never return if the page fault is handled successfully.
*/
-void efi_recover_from_page_fault(unsigned long phys_addr)
+void efi_crash_gracefully_on_page_fault(unsigned long phys_addr)
{
if (!IS_ENABLED(CONFIG_X86_64))
return;
/*
+ * If we get an interrupt/NMI while processing an EFI runtime service
+ * then this is a regular OOPS, not an EFI failure.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
* Make sure that an efi runtime service caused the page fault.
+ * READ_ONCE() because we might be OOPSing in a different thread,
+ * and we don't want to trip KTSAN while trying to OOPS.
*/
- if (efi_rts_work.efi_rts_id == EFI_NONE)
+ if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE ||
+ current_work() != &efi_rts_work.work)
return;
/*
@@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
set_current_state(TASK_IDLE);
schedule();
}
-
- return;
}
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index c33f744b53882..b39bf3b5e108c 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -22,6 +22,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <linux/dmi.h>
#include <asm/geode.h>
@@ -69,21 +70,15 @@ static struct platform_device alix_buttons_dev = {
static struct gpio_led alix_leds[] = {
{
.name = "alix:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 1,
},
{
.name = "alix:2",
- .gpio = 25,
.default_trigger = "default-off",
- .active_low = 1,
},
{
.name = "alix:3",
- .gpio = 27,
.default_trigger = "default-off",
- .active_low = 1,
},
};
@@ -92,6 +87,17 @@ static struct gpio_led_platform_data alix_leds_data = {
.leds = alix_leds,
};
+static struct gpiod_lookup_table alix_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW),
+ { }
+ },
+};
+
static struct platform_device alix_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -106,6 +112,7 @@ static struct platform_device *alix_devs[] __initdata = {
static void __init register_alix(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&alix_leds_gpio_table);
platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs));
}
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
index 73a3f49b4eb63..d263528c90bbf 100644
--- a/arch/x86/platform/geode/geos.c
+++ b/arch/x86/platform/geode/geos.c
@@ -20,6 +20,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <linux/dmi.h>
#include <asm/geode.h>
@@ -53,21 +54,15 @@ static struct platform_device geos_buttons_dev = {
static struct gpio_led geos_leds[] = {
{
.name = "geos:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 1,
},
{
.name = "geos:2",
- .gpio = 25,
.default_trigger = "default-off",
- .active_low = 1,
},
{
.name = "geos:3",
- .gpio = 27,
.default_trigger = "default-off",
- .active_low = 1,
},
};
@@ -76,6 +71,17 @@ static struct gpio_led_platform_data geos_leds_data = {
.leds = geos_leds,
};
+static struct gpiod_lookup_table geos_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW),
+ { }
+ },
+};
+
static struct platform_device geos_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -90,6 +96,7 @@ static struct platform_device *geos_devs[] __initdata = {
static void __init register_geos(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&geos_leds_gpio_table);
platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs));
}
diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c
index 163e1b5455170..558384acd7776 100644
--- a/arch/x86/platform/geode/net5501.c
+++ b/arch/x86/platform/geode/net5501.c
@@ -20,6 +20,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <asm/geode.h>
@@ -55,9 +56,7 @@ static struct platform_device net5501_buttons_dev = {
static struct gpio_led net5501_leds[] = {
{
.name = "net5501:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 0,
},
};
@@ -66,6 +65,15 @@ static struct gpio_led_platform_data net5501_leds_data = {
.leds = net5501_leds,
};
+static struct gpiod_lookup_table net5501_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_HIGH),
+ { }
+ },
+};
+
static struct platform_device net5501_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -80,6 +88,7 @@ static struct platform_device *net5501_devs[] __initdata = {
static void __init register_net5501(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&net5501_leds_gpio_table);
platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs));
}
diff --git a/arch/x86/platform/goldfish/Makefile b/arch/x86/platform/goldfish/Makefile
deleted file mode 100644
index 072c395379aca..0000000000000
--- a/arch/x86/platform/goldfish/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_GOLDFISH) += goldfish.o
diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c
deleted file mode 100644
index 6b6f8b4360dd4..0000000000000
--- a/arch/x86/platform/goldfish/goldfish.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2007 Google, Inc.
- * Copyright (C) 2011 Intel, Inc.
- * Copyright (C) 2013 Intel, Inc.
- */
-
-#include <linux/kernel.h>
-#include <linux/irq.h>
-#include <linux/platform_device.h>
-
-/*
- * Where in virtual device memory the IO devices (timers, system controllers
- * and so on)
- */
-
-#define GOLDFISH_PDEV_BUS_BASE (0xff001000)
-#define GOLDFISH_PDEV_BUS_END (0xff7fffff)
-#define GOLDFISH_PDEV_BUS_IRQ (4)
-
-#define GOLDFISH_TTY_BASE (0x2000)
-
-static struct resource goldfish_pdev_bus_resources[] = {
- {
- .start = GOLDFISH_PDEV_BUS_BASE,
- .end = GOLDFISH_PDEV_BUS_END,
- .flags = IORESOURCE_MEM,
- },
- {
- .start = GOLDFISH_PDEV_BUS_IRQ,
- .end = GOLDFISH_PDEV_BUS_IRQ,
- .flags = IORESOURCE_IRQ,
- }
-};
-
-static bool goldfish_enable __initdata;
-
-static int __init goldfish_setup(char *str)
-{
- goldfish_enable = true;
- return 0;
-}
-__setup("goldfish", goldfish_setup);
-
-static int __init goldfish_init(void)
-{
- if (!goldfish_enable)
- return -ENODEV;
-
- platform_device_register_simple("goldfish_pdev_bus", -1,
- goldfish_pdev_bus_resources, 2);
- return 0;
-}
-device_initcall(goldfish_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
index 31dda18bb3700..2930b6e9473e1 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
@@ -88,8 +88,8 @@ static int __init bt_sfi_init(void)
memset(&info, 0, sizeof(info));
info.fwnode = ddata->dev->fwnode;
info.parent = ddata->dev;
- info.name = ddata->name,
- info.id = PLATFORM_DEVID_NONE,
+ info.name = ddata->name;
+ info.id = PLATFORM_DEVID_NONE;
pdev = platform_device_register_full(&info);
if (IS_ERR(pdev))
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 43b4d864817ec..d2ccadc247e6f 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -16,6 +16,7 @@
#include <asm/boot.h>
#include <asm/processor-flags.h>
#include <asm/msr.h>
+#include <asm/nospec-branch.h>
#include <xen/interface/elfnote.h>
__HEAD
@@ -105,6 +106,7 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
/* startup_64 expects boot_params in %rsi. */
mov $_pa(pvh_bootparams), %rsi
mov $_pa(startup_64), %rax
+ ANNOTATE_RETPOLINE_SAFE
jmp *%rax
#else /* CONFIG_X86_64 */
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 6907b523e856b..3ff80156f21a6 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y
# __restore_processor_state() restores %gs after S3 resume and so should not
# itself be stack-protected
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index 7918b8415f132..d9bed596d849c 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -21,6 +21,53 @@
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
#include <asm/frame.h>
+#include <asm/nospec-branch.h>
+
+ /* code below belongs to the image kernel */
+ .align PAGE_SIZE
+SYM_FUNC_START(restore_registers)
+ /* go back to the original page tables */
+ movq %r9, %cr3
+
+ /* Flush TLB, including "global" things (vmalloc) */
+ movq mmu_cr4_features(%rip), %rax
+ movq %rax, %rdx
+ andq $~(X86_CR4_PGE), %rdx
+ movq %rdx, %cr4; # turn off PGE
+ movq %cr3, %rcx; # flush TLB
+ movq %rcx, %cr3
+ movq %rax, %cr4; # turn PGE back on
+
+ /* We don't restore %rax, it must be 0 anyway */
+ movq $saved_context, %rax
+ movq pt_regs_sp(%rax), %rsp
+ movq pt_regs_bp(%rax), %rbp
+ movq pt_regs_si(%rax), %rsi
+ movq pt_regs_di(%rax), %rdi
+ movq pt_regs_bx(%rax), %rbx
+ movq pt_regs_cx(%rax), %rcx
+ movq pt_regs_dx(%rax), %rdx
+ movq pt_regs_r8(%rax), %r8
+ movq pt_regs_r9(%rax), %r9
+ movq pt_regs_r10(%rax), %r10
+ movq pt_regs_r11(%rax), %r11
+ movq pt_regs_r12(%rax), %r12
+ movq pt_regs_r13(%rax), %r13
+ movq pt_regs_r14(%rax), %r14
+ movq pt_regs_r15(%rax), %r15
+ pushq pt_regs_flags(%rax)
+ popfq
+
+ /* Saved in save_processor_state. */
+ lgdt saved_context_gdt_desc(%rax)
+
+ xorl %eax, %eax
+
+ /* tell the hibernation core that we've just restored the memory */
+ movq %rax, in_suspend(%rip)
+
+ ret
+SYM_FUNC_END(restore_registers)
SYM_FUNC_START(swsusp_arch_suspend)
movq $saved_context, %rax
@@ -52,7 +99,7 @@ SYM_FUNC_START(swsusp_arch_suspend)
ret
SYM_FUNC_END(swsusp_arch_suspend)
-SYM_CODE_START(restore_image)
+SYM_FUNC_START(restore_image)
/* prepare to jump to the image kernel */
movq restore_jump_address(%rip), %r8
movq restore_cr3(%rip), %r9
@@ -66,11 +113,12 @@ SYM_CODE_START(restore_image)
/* jump to relocated restore code */
movq relocated_restore_code(%rip), %rcx
+ ANNOTATE_RETPOLINE_SAFE
jmpq *%rcx
-SYM_CODE_END(restore_image)
+SYM_FUNC_END(restore_image)
/* code below has been relocated to a safe page */
-SYM_CODE_START(core_restore_code)
+SYM_FUNC_START(core_restore_code)
/* switch to temporary page tables */
movq %rax, %cr3
/* flush TLB */
@@ -97,51 +145,6 @@ SYM_CODE_START(core_restore_code)
.Ldone:
/* jump to the restore_registers address from the image header */
+ ANNOTATE_RETPOLINE_SAFE
jmpq *%r8
-SYM_CODE_END(core_restore_code)
-
- /* code below belongs to the image kernel */
- .align PAGE_SIZE
-SYM_FUNC_START(restore_registers)
- /* go back to the original page tables */
- movq %r9, %cr3
-
- /* Flush TLB, including "global" things (vmalloc) */
- movq mmu_cr4_features(%rip), %rax
- movq %rax, %rdx
- andq $~(X86_CR4_PGE), %rdx
- movq %rdx, %cr4; # turn off PGE
- movq %cr3, %rcx; # flush TLB
- movq %rcx, %cr3
- movq %rax, %cr4; # turn PGE back on
-
- /* We don't restore %rax, it must be 0 anyway */
- movq $saved_context, %rax
- movq pt_regs_sp(%rax), %rsp
- movq pt_regs_bp(%rax), %rbp
- movq pt_regs_si(%rax), %rsi
- movq pt_regs_di(%rax), %rdi
- movq pt_regs_bx(%rax), %rbx
- movq pt_regs_cx(%rax), %rcx
- movq pt_regs_dx(%rax), %rdx
- movq pt_regs_r8(%rax), %r8
- movq pt_regs_r9(%rax), %r9
- movq pt_regs_r10(%rax), %r10
- movq pt_regs_r11(%rax), %r11
- movq pt_regs_r12(%rax), %r12
- movq pt_regs_r13(%rax), %r13
- movq pt_regs_r14(%rax), %r14
- movq pt_regs_r15(%rax), %r15
- pushq pt_regs_flags(%rax)
- popfq
-
- /* Saved in save_processor_state. */
- lgdt saved_context_gdt_desc(%rax)
-
- xorl %eax, %eax
-
- /* tell the hibernation core that we've just restored the memory */
- movq %rax, in_suspend(%rip)
-
- ret
-SYM_FUNC_END(restore_registers)
+SYM_FUNC_END(core_restore_code)
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index 55b1ab378974b..bddfc9a466453 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -29,14 +29,14 @@ posttest: $(obj)/insn_decoder_test vmlinux $(obj)/insn_sanity
hostprogs += insn_decoder_test insn_sanity
# -I needed for generated C source and C source which is in the kernel tree.
-HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/
+HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(srctree)/tools/arch/x86/lib/ -I$(srctree)/tools/arch/x86/include/ -I$(objtree)/arch/x86/lib/
-HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+HOSTCFLAGS_insn_sanity.o := -Wall -I$(srctree)/tools/arch/x86/lib/ -I$(srctree)/tools/arch/x86/include/ -I$(objtree)/arch/x86/lib/
# Dependencies are also needed.
-$(obj)/insn_decoder_test.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+$(obj)/insn_decoder_test.o: $(srctree)/tools/arch/x86/lib/insn.c $(srctree)/tools/arch/x86/lib/inat.c $(srctree)/tools/arch/x86/include/asm/inat_types.h $(srctree)/tools/arch/x86/include/asm/inat.h $(srctree)/tools/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
-$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+$(obj)/insn_sanity.o: $(srctree)/tools/arch/x86/lib/insn.c $(srctree)/tools/arch/x86/lib/inat.c $(srctree)/tools/arch/x86/include/asm/inat_types.h $(srctree)/tools/arch/x86/include/asm/inat.h $(srctree)/tools/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
hostprogs += relocs
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
index 185ceba9d289b..c6a0000ae635f 100644
--- a/arch/x86/tools/insn_sanity.c
+++ b/arch/x86/tools/insn_sanity.c
@@ -14,10 +14,6 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
-
-#define unlikely(cond) (cond)
-#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
-
#include <asm/insn.h>
#include <inat.c>
#include <insn.c>
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index b9c577a3cacca..04c5a44b96827 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -867,9 +867,11 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
case R_386_PC32:
case R_386_PC16:
case R_386_PC8:
+ case R_386_PLT32:
/*
- * NONE can be ignored and PC relative relocations don't
- * need to be adjusted.
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
*/
break;
@@ -910,9 +912,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
case R_386_PC32:
case R_386_PC16:
case R_386_PC8:
+ case R_386_PLT32:
/*
- * NONE can be ignored and PC relative relocations don't
- * need to be adjusted.
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
*/
break;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index fc5c5ba4aacba..40b5779fce21c 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-OBJECT_FILES_NON_STANDARD_xen-asm.o := y
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 9a5a50cdaab59..dc0a337f985b6 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -567,10 +567,16 @@ void noist_exc_debug(struct pt_regs *regs);
DEFINE_IDTENTRY_RAW(xenpv_exc_nmi)
{
- /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */
+ /* On Xen PV, NMI doesn't use IST. The C part is the same as native. */
exc_nmi(regs);
}
+DEFINE_IDTENTRY_RAW_ERRORCODE(xenpv_exc_double_fault)
+{
+ /* On Xen PV, DF doesn't use IST. The C part is the same as native. */
+ exc_double_fault(regs, error_code);
+}
+
DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
{
/*
@@ -590,6 +596,20 @@ DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap)
BUG();
}
+#ifdef CONFIG_X86_MCE
+DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
+{
+ /*
+ * There's no IST on Xen PV, but we still need to dispatch
+ * to the correct handler.
+ */
+ if (user_mode(regs))
+ noist_exc_machine_check(regs);
+ else
+ exc_machine_check(regs);
+}
+#endif
+
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
@@ -608,9 +628,9 @@ struct trap_array_entry {
static struct trap_array_entry trap_array[] = {
TRAP_ENTRY_REDIR(exc_debug, true ),
- TRAP_ENTRY(exc_double_fault, true ),
+ TRAP_ENTRY_REDIR(exc_double_fault, true ),
#ifdef CONFIG_X86_MCE
- TRAP_ENTRY(exc_machine_check, true ),
+ TRAP_ENTRY_REDIR(exc_machine_check, true ),
#endif
TRAP_ENTRY_REDIR(exc_nmi, true ),
TRAP_ENTRY(exc_int3, false ),
@@ -1015,8 +1035,6 @@ void __init xen_setup_vcpu_info_placement(void)
*/
if (xen_have_vcpu_info_placement) {
pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
- pv_ops.irq.restore_fl =
- __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_ops.irq.irq_disable =
__PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
pv_ops.irq.irq_enable =
@@ -1053,7 +1071,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_pmc = xen_read_pmc,
.iret = xen_iret,
- .usergs_sysret64 = xen_sysret64,
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
@@ -1078,9 +1095,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
#endif
.io_delay = xen_io_delay,
- /* Xen takes care of %gs when switching to usermode for us */
- .swapgs = paravirt_nop,
-
.start_context_switch = paravirt_start_context_switch,
.end_context_switch = xen_end_context_switch,
};
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 850c93f346c70..dfa091d79c2e1 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -42,28 +42,6 @@ asmlinkage __visible unsigned long xen_save_fl(void)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
-__visible void xen_restore_fl(unsigned long flags)
-{
- struct vcpu_info *vcpu;
-
- /* convert from IF type flag */
- flags = !(flags & X86_EFLAGS_IF);
-
- /* See xen_irq_enable() for why preemption must be disabled. */
- preempt_disable();
- vcpu = this_cpu_read(xen_vcpu);
- vcpu->evtchn_upcall_mask = flags;
-
- if (flags == 0) {
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- xen_force_evtchn_callback();
- preempt_enable();
- } else
- preempt_enable_no_resched();
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
-
asmlinkage __visible void xen_irq_disable(void)
{
/* There's a one instruction preempt window here. We need to
@@ -118,7 +96,6 @@ static void xen_halt(void)
static const struct pv_irq_ops xen_irq_ops __initconst = {
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
- .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 53cf8aa35032d..1e626444712be 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -14,6 +14,7 @@
#include <asm/thread_info.h>
#include <asm/asm.h>
#include <asm/frame.h>
+#include <asm/unwind_hints.h>
#include <xen/interface/xen.h>
@@ -72,34 +73,6 @@ SYM_FUNC_START(xen_save_fl_direct)
ret
SYM_FUNC_END(xen_save_fl_direct)
-
-/*
- * In principle the caller should be passing us a value return from
- * xen_save_fl_direct, but for robustness sake we test only the
- * X86_EFLAGS_IF flag rather than the whole byte. After setting the
- * interrupt mask state, it checks for unmasked pending events and
- * enters the hypervisor to get them delivered if so.
- */
-SYM_FUNC_START(xen_restore_fl_direct)
- FRAME_BEGIN
- testw $X86_EFLAGS_IF, %di
- setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
- /*
- * Preempt here doesn't matter because that will deal with any
- * pending interrupts. The pending check may end up being run
- * on the wrong CPU, but that doesn't hurt.
- */
-
- /* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
- jnz 1f
- call check_events
-1:
- FRAME_END
- ret
-SYM_FUNC_END(xen_restore_fl_direct)
-
-
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
@@ -146,6 +119,7 @@ SYM_FUNC_END(xen_read_cr2_direct);
.macro xen_pv_trap name
SYM_CODE_START(xen_\name)
+ UNWIND_HINT_EMPTY
pop %rcx
pop %r11
jmp \name
@@ -161,7 +135,7 @@ xen_pv_trap asm_exc_overflow
xen_pv_trap asm_exc_bounds
xen_pv_trap asm_exc_invalid_op
xen_pv_trap asm_exc_device_not_available
-xen_pv_trap asm_exc_double_fault
+xen_pv_trap asm_xenpv_exc_double_fault
xen_pv_trap asm_exc_coproc_segment_overrun
xen_pv_trap asm_exc_invalid_tss
xen_pv_trap asm_exc_segment_not_present
@@ -172,7 +146,7 @@ xen_pv_trap asm_exc_spurious_interrupt_bug
xen_pv_trap asm_exc_coprocessor_error
xen_pv_trap asm_exc_alignment_check
#ifdef CONFIG_X86_MCE
-xen_pv_trap asm_exc_machine_check
+xen_pv_trap asm_xenpv_exc_machine_check
#endif /* CONFIG_X86_MCE */
xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
@@ -185,6 +159,7 @@ xen_pv_trap asm_exc_xen_hypervisor_callback
SYM_CODE_START(xen_early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
+ UNWIND_HINT_EMPTY
pop %rcx
pop %r11
jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE
@@ -211,30 +186,11 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
* rsp->rax }
*/
SYM_CODE_START(xen_iret)
+ UNWIND_HINT_EMPTY
pushq $0
jmp hypercall_iret
SYM_CODE_END(xen_iret)
-SYM_CODE_START(xen_sysret64)
- /*
- * We're already on the usermode stack at this point, but
- * still with the kernel gs, so we can easily switch back.
- *
- * tss.sp2 is scratch space.
- */
- movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-
- pushq $__USER_DS
- pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- pushq %r11
- pushq $__USER_CS
- pushq %rcx
-
- pushq $VGCF_in_syscall
- jmp hypercall_iret
-SYM_CODE_END(xen_sysret64)
-
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
@@ -251,7 +207,8 @@ SYM_CODE_END(xen_sysret64)
*/
/* Normal 64-bit system call target */
-SYM_FUNC_START(xen_syscall_target)
+SYM_CODE_START(xen_syscall_target)
+ UNWIND_HINT_EMPTY
popq %rcx
popq %r11
@@ -264,12 +221,13 @@ SYM_FUNC_START(xen_syscall_target)
movq $__USER_CS, 1*8(%rsp)
jmp entry_SYSCALL_64_after_hwframe
-SYM_FUNC_END(xen_syscall_target)
+SYM_CODE_END(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
-SYM_FUNC_START(xen_syscall32_target)
+SYM_CODE_START(xen_syscall32_target)
+ UNWIND_HINT_EMPTY
popq %rcx
popq %r11
@@ -282,10 +240,11 @@ SYM_FUNC_START(xen_syscall32_target)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSCALL_compat_after_hwframe
-SYM_FUNC_END(xen_syscall32_target)
+SYM_CODE_END(xen_syscall32_target)
/* 32-bit compat sysenter target */
-SYM_FUNC_START(xen_sysenter_target)
+SYM_CODE_START(xen_sysenter_target)
+ UNWIND_HINT_EMPTY
/*
* NB: Xen is polite and clears TF from EFLAGS for us. This means
* that we don't need to guard against single step exceptions here.
@@ -302,17 +261,18 @@ SYM_FUNC_START(xen_sysenter_target)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSENTER_compat_after_hwframe
-SYM_FUNC_END(xen_sysenter_target)
+SYM_CODE_END(xen_sysenter_target)
#else /* !CONFIG_IA32_EMULATION */
-SYM_FUNC_START_ALIAS(xen_syscall32_target)
-SYM_FUNC_START(xen_sysenter_target)
+SYM_CODE_START(xen_syscall32_target)
+SYM_CODE_START(xen_sysenter_target)
+ UNWIND_HINT_EMPTY
lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
pushq $0
jmp hypercall_iret
-SYM_FUNC_END(xen_sysenter_target)
-SYM_FUNC_END_ALIAS(xen_syscall32_target)
+SYM_CODE_END(xen_sysenter_target)
+SYM_CODE_END(xen_syscall32_target)
#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 2d7c8f34f56c7..cb6538ae2fe07 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -68,8 +68,9 @@ SYM_CODE_END(asm_cpu_bringup_and_idle)
.balign PAGE_SIZE
SYM_CODE_START(hypercall_page)
.rept (PAGE_SIZE / 32)
- UNWIND_HINT_EMPTY
- .skip 32
+ UNWIND_HINT_FUNC
+ .skip 31, 0x90
+ ret
.endr
#define HYPERCALL(n) \
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9546c3384c759..8d7ec49a35fbb 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -131,15 +131,12 @@ static inline void __init xen_efi_init(struct boot_params *boot_params)
__visible void xen_irq_enable_direct(void);
__visible void xen_irq_disable_direct(void);
__visible unsigned long xen_save_fl_direct(void);
-__visible void xen_restore_fl_direct(unsigned long);
__visible unsigned long xen_read_cr2(void);
__visible unsigned long xen_read_cr2_direct(void);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
-__visible void xen_sysret32(void);
-__visible void xen_sysret64(void);
extern int xen_panic_handler_init(void);
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 14c7c47124787..d184000eb09e1 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -197,12 +197,6 @@ config CLPS711X_TIMER
help
Enables support for the Cirrus Logic PS711 timer.
-config ATLAS7_TIMER
- bool "Atlas7 timer driver" if COMPILE_TEST
- select CLKSRC_MMIO
- help
- Enables support for the Atlas7 timer.
-
config MXS_TIMER
bool "MXS timer driver" if COMPILE_TEST
select CLKSRC_MMIO
@@ -210,19 +204,6 @@ config MXS_TIMER
help
Enables support for the MXS timer.
-config PRIMA2_TIMER
- bool "Prima2 timer driver" if COMPILE_TEST
- select CLKSRC_MMIO
- help
- Enables support for the Prima2 timer.
-
-config U300_TIMER
- bool "U300 timer driver" if COMPILE_TEST
- depends on ARM
- select CLKSRC_MMIO
- help
- Enables support for the U300 timer.
-
config NSPIRE_TIMER
bool "NSpire timer driver" if COMPILE_TEST
select CLKSRC_MMIO
@@ -242,15 +223,6 @@ config INTEGRATOR_AP_TIMER
help
Enables support for the Integrator-AP timer.
-config CLKSRC_EFM32
- bool "Clocksource for Energy Micro's EFM32 SoCs" if !ARCH_EFM32
- depends on OF && ARM && (ARCH_EFM32 || COMPILE_TEST)
- select CLKSRC_MMIO
- default ARCH_EFM32
- help
- Support to use the timers of EFM32 SoCs as clock source and clock
- event device.
-
config CLKSRC_LPC32XX
bool "Clocksource for LPC32XX" if COMPILE_TEST
depends on HAS_IOMEM
@@ -567,14 +539,6 @@ config CLKSRC_MIPS_GIC
select CLOCKSOURCE_WATCHDOG
select TIMER_OF
-config CLKSRC_TANGO_XTAL
- bool "Clocksource for Tango SoC" if COMPILE_TEST
- depends on ARM
- select TIMER_OF
- select CLKSRC_MMIO
- help
- This enables the clocksource for Tango SoC.
-
config CLKSRC_PXA
bool "Clocksource for PXA or SA-11x0 platform" if COMPILE_TEST
depends on HAS_IOMEM
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 3c75cbbf8533b..c17ee32a71515 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -30,11 +30,8 @@ obj-$(CONFIG_ARMADA_370_XP_TIMER) += timer-armada-370-xp.o
obj-$(CONFIG_ORION_TIMER) += timer-orion.o
obj-$(CONFIG_BCM2835_TIMER) += bcm2835_timer.o
obj-$(CONFIG_CLPS711X_TIMER) += clps711x-timer.o
-obj-$(CONFIG_ATLAS7_TIMER) += timer-atlas7.o
obj-$(CONFIG_MXS_TIMER) += mxs_timer.o
obj-$(CONFIG_CLKSRC_PXA) += timer-pxa.o
-obj-$(CONFIG_PRIMA2_TIMER) += timer-prima2.o
-obj-$(CONFIG_U300_TIMER) += timer-u300.o
obj-$(CONFIG_SUN4I_TIMER) += timer-sun4i.o
obj-$(CONFIG_SUN5I_HSTIMER) += timer-sun5i.o
obj-$(CONFIG_MESON6_TIMER) += timer-meson6.o
@@ -43,7 +40,6 @@ obj-$(CONFIG_VT8500_TIMER) += timer-vt8500.o
obj-$(CONFIG_NSPIRE_TIMER) += timer-zevio.o
obj-$(CONFIG_BCM_KONA_TIMER) += bcm_kona_timer.o
obj-$(CONFIG_CADENCE_TTC_TIMER) += timer-cadence-ttc.o
-obj-$(CONFIG_CLKSRC_EFM32) += timer-efm32.o
obj-$(CONFIG_CLKSRC_STM32) += timer-stm32.o
obj-$(CONFIG_CLKSRC_STM32_LP) += timer-stm32-lp.o
obj-$(CONFIG_CLKSRC_EXYNOS_MCT) += exynos_mct.o
@@ -73,7 +69,6 @@ obj-$(CONFIG_KEYSTONE_TIMER) += timer-keystone.o
obj-$(CONFIG_INTEGRATOR_AP_TIMER) += timer-integrator-ap.o
obj-$(CONFIG_CLKSRC_VERSATILE) += timer-versatile.o
obj-$(CONFIG_CLKSRC_MIPS_GIC) += mips-gic-timer.o
-obj-$(CONFIG_CLKSRC_TANGO_XTAL) += timer-tango-xtal.o
obj-$(CONFIG_CLKSRC_IMX_GPT) += timer-imx-gpt.o
obj-$(CONFIG_CLKSRC_IMX_TPM) += timer-imx-tpm.o
obj-$(CONFIG_TIMER_IMX_SYS_CTR) += timer-imx-sysctr.o
diff --git a/drivers/clocksource/timer-atlas7.c b/drivers/clocksource/timer-atlas7.c
deleted file mode 100644
index c21c91c2bc568..0000000000000
--- a/drivers/clocksource/timer-atlas7.c
+++ /dev/null
@@ -1,281 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * System timer for CSR SiRFprimaII
- *
- * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company.
- */
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/clockchips.h>
-#include <linux/clocksource.h>
-#include <linux/cpu.h>
-#include <linux/bitops.h>
-#include <linux/irq.h>
-#include <linux/clk.h>
-#include <linux/slab.h>
-#include <linux/of.h>
-#include <linux/of_irq.h>
-#include <linux/of_address.h>
-#include <linux/sched_clock.h>
-
-#define SIRFSOC_TIMER_32COUNTER_0_CTRL 0x0000
-#define SIRFSOC_TIMER_32COUNTER_1_CTRL 0x0004
-#define SIRFSOC_TIMER_MATCH_0 0x0018
-#define SIRFSOC_TIMER_MATCH_1 0x001c
-#define SIRFSOC_TIMER_COUNTER_0 0x0048
-#define SIRFSOC_TIMER_COUNTER_1 0x004c
-#define SIRFSOC_TIMER_INTR_STATUS 0x0060
-#define SIRFSOC_TIMER_WATCHDOG_EN 0x0064
-#define SIRFSOC_TIMER_64COUNTER_CTRL 0x0068
-#define SIRFSOC_TIMER_64COUNTER_LO 0x006c
-#define SIRFSOC_TIMER_64COUNTER_HI 0x0070
-#define SIRFSOC_TIMER_64COUNTER_LOAD_LO 0x0074
-#define SIRFSOC_TIMER_64COUNTER_LOAD_HI 0x0078
-#define SIRFSOC_TIMER_64COUNTER_RLATCHED_LO 0x007c
-#define SIRFSOC_TIMER_64COUNTER_RLATCHED_HI 0x0080
-
-#define SIRFSOC_TIMER_REG_CNT 6
-
-static unsigned long atlas7_timer_rate;
-
-static const u32 sirfsoc_timer_reg_list[SIRFSOC_TIMER_REG_CNT] = {
- SIRFSOC_TIMER_WATCHDOG_EN,
- SIRFSOC_TIMER_32COUNTER_0_CTRL,
- SIRFSOC_TIMER_32COUNTER_1_CTRL,
- SIRFSOC_TIMER_64COUNTER_CTRL,
- SIRFSOC_TIMER_64COUNTER_RLATCHED_LO,
- SIRFSOC_TIMER_64COUNTER_RLATCHED_HI,
-};
-
-static u32 sirfsoc_timer_reg_val[SIRFSOC_TIMER_REG_CNT];
-
-static void __iomem *sirfsoc_timer_base;
-
-/* disable count and interrupt */
-static inline void sirfsoc_timer_count_disable(int idx)
-{
- writel_relaxed(readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_32COUNTER_0_CTRL + 4 * idx) & ~0x7,
- sirfsoc_timer_base + SIRFSOC_TIMER_32COUNTER_0_CTRL + 4 * idx);
-}
-
-/* enable count and interrupt */
-static inline void sirfsoc_timer_count_enable(int idx)
-{
- writel_relaxed(readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_32COUNTER_0_CTRL + 4 * idx) | 0x3,
- sirfsoc_timer_base + SIRFSOC_TIMER_32COUNTER_0_CTRL + 4 * idx);
-}
-
-/* timer interrupt handler */
-static irqreturn_t sirfsoc_timer_interrupt(int irq, void *dev_id)
-{
- struct clock_event_device *ce = dev_id;
- int cpu = smp_processor_id();
-
- /* clear timer interrupt */
- writel_relaxed(BIT(cpu), sirfsoc_timer_base + SIRFSOC_TIMER_INTR_STATUS);
-
- if (clockevent_state_oneshot(ce))
- sirfsoc_timer_count_disable(cpu);
-
- ce->event_handler(ce);
-
- return IRQ_HANDLED;
-}
-
-/* read 64-bit timer counter */
-static u64 sirfsoc_timer_read(struct clocksource *cs)
-{
- u64 cycles;
-
- writel_relaxed((readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_CTRL) |
- BIT(0)) & ~BIT(1), sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_CTRL);
-
- cycles = readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_RLATCHED_HI);
- cycles = (cycles << 32) | readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_RLATCHED_LO);
-
- return cycles;
-}
-
-static int sirfsoc_timer_set_next_event(unsigned long delta,
- struct clock_event_device *ce)
-{
- int cpu = smp_processor_id();
-
- /* disable timer first, then modify the related registers */
- sirfsoc_timer_count_disable(cpu);
-
- writel_relaxed(0, sirfsoc_timer_base + SIRFSOC_TIMER_COUNTER_0 +
- 4 * cpu);
- writel_relaxed(delta, sirfsoc_timer_base + SIRFSOC_TIMER_MATCH_0 +
- 4 * cpu);
-
- /* enable the tick */
- sirfsoc_timer_count_enable(cpu);
-
- return 0;
-}
-
-/* Oneshot is enabled in set_next_event */
-static int sirfsoc_timer_shutdown(struct clock_event_device *evt)
-{
- sirfsoc_timer_count_disable(smp_processor_id());
- return 0;
-}
-
-static void sirfsoc_clocksource_suspend(struct clocksource *cs)
-{
- int i;
-
- for (i = 0; i < SIRFSOC_TIMER_REG_CNT; i++)
- sirfsoc_timer_reg_val[i] = readl_relaxed(sirfsoc_timer_base + sirfsoc_timer_reg_list[i]);
-}
-
-static void sirfsoc_clocksource_resume(struct clocksource *cs)
-{
- int i;
-
- for (i = 0; i < SIRFSOC_TIMER_REG_CNT - 2; i++)
- writel_relaxed(sirfsoc_timer_reg_val[i], sirfsoc_timer_base + sirfsoc_timer_reg_list[i]);
-
- writel_relaxed(sirfsoc_timer_reg_val[SIRFSOC_TIMER_REG_CNT - 2],
- sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_LOAD_LO);
- writel_relaxed(sirfsoc_timer_reg_val[SIRFSOC_TIMER_REG_CNT - 1],
- sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_LOAD_HI);
-
- writel_relaxed(readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_CTRL) |
- BIT(1) | BIT(0), sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_CTRL);
-}
-
-static struct clock_event_device __percpu *sirfsoc_clockevent;
-
-static struct clocksource sirfsoc_clocksource = {
- .name = "sirfsoc_clocksource",
- .rating = 200,
- .mask = CLOCKSOURCE_MASK(64),
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
- .read = sirfsoc_timer_read,
- .suspend = sirfsoc_clocksource_suspend,
- .resume = sirfsoc_clocksource_resume,
-};
-
-static unsigned int sirfsoc_timer_irq, sirfsoc_timer1_irq;
-
-static int sirfsoc_local_timer_starting_cpu(unsigned int cpu)
-{
- struct clock_event_device *ce = per_cpu_ptr(sirfsoc_clockevent, cpu);
- unsigned int irq;
- const char *name;
-
- if (cpu == 0) {
- irq = sirfsoc_timer_irq;
- name = "sirfsoc_timer0";
- } else {
- irq = sirfsoc_timer1_irq;
- name = "sirfsoc_timer1";
- }
-
- ce->irq = irq;
- ce->name = "local_timer";
- ce->features = CLOCK_EVT_FEAT_ONESHOT;
- ce->rating = 200;
- ce->set_state_shutdown = sirfsoc_timer_shutdown;
- ce->set_state_oneshot = sirfsoc_timer_shutdown;
- ce->tick_resume = sirfsoc_timer_shutdown;
- ce->set_next_event = sirfsoc_timer_set_next_event;
- clockevents_calc_mult_shift(ce, atlas7_timer_rate, 60);
- ce->max_delta_ns = clockevent_delta2ns(-2, ce);
- ce->max_delta_ticks = (unsigned long)-2;
- ce->min_delta_ns = clockevent_delta2ns(2, ce);
- ce->min_delta_ticks = 2;
- ce->cpumask = cpumask_of(cpu);
-
- BUG_ON(request_irq(ce->irq, sirfsoc_timer_interrupt,
- IRQF_TIMER | IRQF_NOBALANCING, name, ce));
- irq_force_affinity(ce->irq, cpumask_of(cpu));
-
- clockevents_register_device(ce);
- return 0;
-}
-
-static int sirfsoc_local_timer_dying_cpu(unsigned int cpu)
-{
- struct clock_event_device *ce = per_cpu_ptr(sirfsoc_clockevent, cpu);
-
- sirfsoc_timer_count_disable(1);
-
- if (cpu == 0)
- free_irq(sirfsoc_timer_irq, ce);
- else
- free_irq(sirfsoc_timer1_irq, ce);
- return 0;
-}
-
-static int __init sirfsoc_clockevent_init(void)
-{
- sirfsoc_clockevent = alloc_percpu(struct clock_event_device);
- BUG_ON(!sirfsoc_clockevent);
-
- /* Install and invoke hotplug callbacks */
- return cpuhp_setup_state(CPUHP_AP_MARCO_TIMER_STARTING,
- "clockevents/marco:starting",
- sirfsoc_local_timer_starting_cpu,
- sirfsoc_local_timer_dying_cpu);
-}
-
-/* initialize the kernel jiffy timer source */
-static int __init sirfsoc_atlas7_timer_init(struct device_node *np)
-{
- struct clk *clk;
-
- clk = of_clk_get(np, 0);
- BUG_ON(IS_ERR(clk));
-
- BUG_ON(clk_prepare_enable(clk));
-
- atlas7_timer_rate = clk_get_rate(clk);
-
- /* timer dividers: 0, not divided */
- writel_relaxed(0, sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_CTRL);
- writel_relaxed(0, sirfsoc_timer_base + SIRFSOC_TIMER_32COUNTER_0_CTRL);
- writel_relaxed(0, sirfsoc_timer_base + SIRFSOC_TIMER_32COUNTER_1_CTRL);
-
- /* Initialize timer counters to 0 */
- writel_relaxed(0, sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_LOAD_LO);
- writel_relaxed(0, sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_LOAD_HI);
- writel_relaxed(readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_CTRL) |
- BIT(1) | BIT(0), sirfsoc_timer_base + SIRFSOC_TIMER_64COUNTER_CTRL);
- writel_relaxed(0, sirfsoc_timer_base + SIRFSOC_TIMER_COUNTER_0);
- writel_relaxed(0, sirfsoc_timer_base + SIRFSOC_TIMER_COUNTER_1);
-
- /* Clear all interrupts */
- writel_relaxed(0xFFFF, sirfsoc_timer_base + SIRFSOC_TIMER_INTR_STATUS);
-
- BUG_ON(clocksource_register_hz(&sirfsoc_clocksource, atlas7_timer_rate));
-
- return sirfsoc_clockevent_init();
-}
-
-static int __init sirfsoc_of_timer_init(struct device_node *np)
-{
- sirfsoc_timer_base = of_iomap(np, 0);
- if (!sirfsoc_timer_base) {
- pr_err("unable to map timer cpu registers\n");
- return -ENXIO;
- }
-
- sirfsoc_timer_irq = irq_of_parse_and_map(np, 0);
- if (!sirfsoc_timer_irq) {
- pr_err("No irq passed for timer0 via DT\n");
- return -EINVAL;
- }
-
- sirfsoc_timer1_irq = irq_of_parse_and_map(np, 1);
- if (!sirfsoc_timer1_irq) {
- pr_err("No irq passed for timer1 via DT\n");
- return -EINVAL;
- }
-
- return sirfsoc_atlas7_timer_init(np);
-}
-TIMER_OF_DECLARE(sirfsoc_atlas7_timer, "sirf,atlas7-tick", sirfsoc_of_timer_init);
diff --git a/drivers/clocksource/timer-davinci.c b/drivers/clocksource/timer-davinci.c
index bb4eee31ae082..9996c05425200 100644
--- a/drivers/clocksource/timer-davinci.c
+++ b/drivers/clocksource/timer-davinci.c
@@ -7,6 +7,8 @@
* (with tiny parts adopted from code by Kevin Hilman <khilman@baylibre.com>)
*/
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
#include <linux/clk.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
@@ -17,9 +19,6 @@
#include <clocksource/timer-davinci.h>
-#undef pr_fmt
-#define pr_fmt(fmt) "%s: " fmt, __func__
-
#define DAVINCI_TIMER_REG_TIM12 0x10
#define DAVINCI_TIMER_REG_TIM34 0x14
#define DAVINCI_TIMER_REG_PRD12 0x18
diff --git a/drivers/clocksource/timer-efm32.c b/drivers/clocksource/timer-efm32.c
deleted file mode 100644
index 441a4b916841d..0000000000000
--- a/drivers/clocksource/timer-efm32.c
+++ /dev/null
@@ -1,278 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2013 Pengutronix
- * Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/clk.h>
-
-#define TIMERn_CTRL 0x00
-#define TIMERn_CTRL_PRESC(val) (((val) & 0xf) << 24)
-#define TIMERn_CTRL_PRESC_1024 TIMERn_CTRL_PRESC(10)
-#define TIMERn_CTRL_CLKSEL(val) (((val) & 0x3) << 16)
-#define TIMERn_CTRL_CLKSEL_PRESCHFPERCLK TIMERn_CTRL_CLKSEL(0)
-#define TIMERn_CTRL_OSMEN 0x00000010
-#define TIMERn_CTRL_MODE(val) (((val) & 0x3) << 0)
-#define TIMERn_CTRL_MODE_UP TIMERn_CTRL_MODE(0)
-#define TIMERn_CTRL_MODE_DOWN TIMERn_CTRL_MODE(1)
-
-#define TIMERn_CMD 0x04
-#define TIMERn_CMD_START 0x00000001
-#define TIMERn_CMD_STOP 0x00000002
-
-#define TIMERn_IEN 0x0c
-#define TIMERn_IF 0x10
-#define TIMERn_IFS 0x14
-#define TIMERn_IFC 0x18
-#define TIMERn_IRQ_UF 0x00000002
-
-#define TIMERn_TOP 0x1c
-#define TIMERn_CNT 0x24
-
-struct efm32_clock_event_ddata {
- struct clock_event_device evtdev;
- void __iomem *base;
- unsigned periodic_top;
-};
-
-static int efm32_clock_event_shutdown(struct clock_event_device *evtdev)
-{
- struct efm32_clock_event_ddata *ddata =
- container_of(evtdev, struct efm32_clock_event_ddata, evtdev);
-
- writel_relaxed(TIMERn_CMD_STOP, ddata->base + TIMERn_CMD);
- return 0;
-}
-
-static int efm32_clock_event_set_oneshot(struct clock_event_device *evtdev)
-{
- struct efm32_clock_event_ddata *ddata =
- container_of(evtdev, struct efm32_clock_event_ddata, evtdev);
-
- writel_relaxed(TIMERn_CMD_STOP, ddata->base + TIMERn_CMD);
- writel_relaxed(TIMERn_CTRL_PRESC_1024 |
- TIMERn_CTRL_CLKSEL_PRESCHFPERCLK |
- TIMERn_CTRL_OSMEN |
- TIMERn_CTRL_MODE_DOWN,
- ddata->base + TIMERn_CTRL);
- return 0;
-}
-
-static int efm32_clock_event_set_periodic(struct clock_event_device *evtdev)
-{
- struct efm32_clock_event_ddata *ddata =
- container_of(evtdev, struct efm32_clock_event_ddata, evtdev);
-
- writel_relaxed(TIMERn_CMD_STOP, ddata->base + TIMERn_CMD);
- writel_relaxed(ddata->periodic_top, ddata->base + TIMERn_TOP);
- writel_relaxed(TIMERn_CTRL_PRESC_1024 |
- TIMERn_CTRL_CLKSEL_PRESCHFPERCLK |
- TIMERn_CTRL_MODE_DOWN,
- ddata->base + TIMERn_CTRL);
- writel_relaxed(TIMERn_CMD_START, ddata->base + TIMERn_CMD);
- return 0;
-}
-
-static int efm32_clock_event_set_next_event(unsigned long evt,
- struct clock_event_device *evtdev)
-{
- struct efm32_clock_event_ddata *ddata =
- container_of(evtdev, struct efm32_clock_event_ddata, evtdev);
-
- writel_relaxed(TIMERn_CMD_STOP, ddata->base + TIMERn_CMD);
- writel_relaxed(evt, ddata->base + TIMERn_CNT);
- writel_relaxed(TIMERn_CMD_START, ddata->base + TIMERn_CMD);
-
- return 0;
-}
-
-static irqreturn_t efm32_clock_event_handler(int irq, void *dev_id)
-{
- struct efm32_clock_event_ddata *ddata = dev_id;
-
- writel_relaxed(TIMERn_IRQ_UF, ddata->base + TIMERn_IFC);
-
- ddata->evtdev.event_handler(&ddata->evtdev);
-
- return IRQ_HANDLED;
-}
-
-static struct efm32_clock_event_ddata clock_event_ddata = {
- .evtdev = {
- .name = "efm32 clockevent",
- .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC,
- .set_state_shutdown = efm32_clock_event_shutdown,
- .set_state_periodic = efm32_clock_event_set_periodic,
- .set_state_oneshot = efm32_clock_event_set_oneshot,
- .set_next_event = efm32_clock_event_set_next_event,
- .rating = 200,
- },
-};
-
-static int __init efm32_clocksource_init(struct device_node *np)
-{
- struct clk *clk;
- void __iomem *base;
- unsigned long rate;
- int ret;
-
- clk = of_clk_get(np, 0);
- if (IS_ERR(clk)) {
- ret = PTR_ERR(clk);
- pr_err("failed to get clock for clocksource (%d)\n", ret);
- goto err_clk_get;
- }
-
- ret = clk_prepare_enable(clk);
- if (ret) {
- pr_err("failed to enable timer clock for clocksource (%d)\n",
- ret);
- goto err_clk_enable;
- }
- rate = clk_get_rate(clk);
-
- base = of_iomap(np, 0);
- if (!base) {
- ret = -EADDRNOTAVAIL;
- pr_err("failed to map registers for clocksource\n");
- goto err_iomap;
- }
-
- writel_relaxed(TIMERn_CTRL_PRESC_1024 |
- TIMERn_CTRL_CLKSEL_PRESCHFPERCLK |
- TIMERn_CTRL_MODE_UP, base + TIMERn_CTRL);
- writel_relaxed(TIMERn_CMD_START, base + TIMERn_CMD);
-
- ret = clocksource_mmio_init(base + TIMERn_CNT, "efm32 timer",
- DIV_ROUND_CLOSEST(rate, 1024), 200, 16,
- clocksource_mmio_readl_up);
- if (ret) {
- pr_err("failed to init clocksource (%d)\n", ret);
- goto err_clocksource_init;
- }
-
- return 0;
-
-err_clocksource_init:
-
- iounmap(base);
-err_iomap:
-
- clk_disable_unprepare(clk);
-err_clk_enable:
-
- clk_put(clk);
-err_clk_get:
-
- return ret;
-}
-
-static int __init efm32_clockevent_init(struct device_node *np)
-{
- struct clk *clk;
- void __iomem *base;
- unsigned long rate;
- int irq;
- int ret;
-
- clk = of_clk_get(np, 0);
- if (IS_ERR(clk)) {
- ret = PTR_ERR(clk);
- pr_err("failed to get clock for clockevent (%d)\n", ret);
- goto err_clk_get;
- }
-
- ret = clk_prepare_enable(clk);
- if (ret) {
- pr_err("failed to enable timer clock for clockevent (%d)\n",
- ret);
- goto err_clk_enable;
- }
- rate = clk_get_rate(clk);
-
- base = of_iomap(np, 0);
- if (!base) {
- ret = -EADDRNOTAVAIL;
- pr_err("failed to map registers for clockevent\n");
- goto err_iomap;
- }
-
- irq = irq_of_parse_and_map(np, 0);
- if (!irq) {
- ret = -ENOENT;
- pr_err("failed to get irq for clockevent\n");
- goto err_get_irq;
- }
-
- writel_relaxed(TIMERn_IRQ_UF, base + TIMERn_IEN);
-
- clock_event_ddata.base = base;
- clock_event_ddata.periodic_top = DIV_ROUND_CLOSEST(rate, 1024 * HZ);
-
- clockevents_config_and_register(&clock_event_ddata.evtdev,
- DIV_ROUND_CLOSEST(rate, 1024),
- 0xf, 0xffff);
-
- ret = request_irq(irq, efm32_clock_event_handler, IRQF_TIMER,
- "efm32 clockevent", &clock_event_ddata);
- if (ret) {
- pr_err("Failed setup irq\n");
- goto err_setup_irq;
- }
-
- return 0;
-
-err_setup_irq:
-err_get_irq:
-
- iounmap(base);
-err_iomap:
-
- clk_disable_unprepare(clk);
-err_clk_enable:
-
- clk_put(clk);
-err_clk_get:
-
- return ret;
-}
-
-/*
- * This function asserts that we have exactly one clocksource and one
- * clock_event_device in the end.
- */
-static int __init efm32_timer_init(struct device_node *np)
-{
- static int has_clocksource, has_clockevent;
- int ret = 0;
-
- if (!has_clocksource) {
- ret = efm32_clocksource_init(np);
- if (!ret) {
- has_clocksource = 1;
- return 0;
- }
- }
-
- if (!has_clockevent) {
- ret = efm32_clockevent_init(np);
- if (!ret) {
- has_clockevent = 1;
- return 0;
- }
- }
-
- return ret;
-}
-TIMER_OF_DECLARE(efm32compat, "efm32,timer", efm32_timer_init);
-TIMER_OF_DECLARE(efm32, "energymicro,efm32-timer", efm32_timer_init);
diff --git a/drivers/clocksource/timer-microchip-pit64b.c b/drivers/clocksource/timer-microchip-pit64b.c
index 59e11ca8ee73e..ab623b25a47b7 100644
--- a/drivers/clocksource/timer-microchip-pit64b.c
+++ b/drivers/clocksource/timer-microchip-pit64b.c
@@ -71,10 +71,24 @@ struct mchp_pit64b_clkevt {
struct clock_event_device clkevt;
};
-#define to_mchp_pit64b_timer(x) \
+#define clkevt_to_mchp_pit64b_timer(x) \
((struct mchp_pit64b_timer *)container_of(x,\
struct mchp_pit64b_clkevt, clkevt))
+/**
+ * struct mchp_pit64b_clksrc - PIT64B clocksource data structure
+ * @timer: PIT64B timer
+ * @clksrc: clocksource
+ */
+struct mchp_pit64b_clksrc {
+ struct mchp_pit64b_timer timer;
+ struct clocksource clksrc;
+};
+
+#define clksrc_to_mchp_pit64b_timer(x) \
+ ((struct mchp_pit64b_timer *)container_of(x,\
+ struct mchp_pit64b_clksrc, clksrc))
+
/* Base address for clocksource timer. */
static void __iomem *mchp_pit64b_cs_base;
/* Default cycles for clockevent timer. */
@@ -116,6 +130,36 @@ static inline void mchp_pit64b_reset(struct mchp_pit64b_timer *timer,
writel_relaxed(MCHP_PIT64B_CR_START, timer->base + MCHP_PIT64B_CR);
}
+static void mchp_pit64b_suspend(struct mchp_pit64b_timer *timer)
+{
+ writel_relaxed(MCHP_PIT64B_CR_SWRST, timer->base + MCHP_PIT64B_CR);
+ if (timer->mode & MCHP_PIT64B_MR_SGCLK)
+ clk_disable_unprepare(timer->gclk);
+ clk_disable_unprepare(timer->pclk);
+}
+
+static void mchp_pit64b_resume(struct mchp_pit64b_timer *timer)
+{
+ clk_prepare_enable(timer->pclk);
+ if (timer->mode & MCHP_PIT64B_MR_SGCLK)
+ clk_prepare_enable(timer->gclk);
+}
+
+static void mchp_pit64b_clksrc_suspend(struct clocksource *cs)
+{
+ struct mchp_pit64b_timer *timer = clksrc_to_mchp_pit64b_timer(cs);
+
+ mchp_pit64b_suspend(timer);
+}
+
+static void mchp_pit64b_clksrc_resume(struct clocksource *cs)
+{
+ struct mchp_pit64b_timer *timer = clksrc_to_mchp_pit64b_timer(cs);
+
+ mchp_pit64b_resume(timer);
+ mchp_pit64b_reset(timer, ULLONG_MAX, MCHP_PIT64B_MR_CONT, 0);
+}
+
static u64 mchp_pit64b_clksrc_read(struct clocksource *cs)
{
return mchp_pit64b_cnt_read(mchp_pit64b_cs_base);
@@ -128,7 +172,7 @@ static u64 mchp_pit64b_sched_read_clk(void)
static int mchp_pit64b_clkevt_shutdown(struct clock_event_device *cedev)
{
- struct mchp_pit64b_timer *timer = to_mchp_pit64b_timer(cedev);
+ struct mchp_pit64b_timer *timer = clkevt_to_mchp_pit64b_timer(cedev);
writel_relaxed(MCHP_PIT64B_CR_SWRST, timer->base + MCHP_PIT64B_CR);
@@ -137,7 +181,7 @@ static int mchp_pit64b_clkevt_shutdown(struct clock_event_device *cedev)
static int mchp_pit64b_clkevt_set_periodic(struct clock_event_device *cedev)
{
- struct mchp_pit64b_timer *timer = to_mchp_pit64b_timer(cedev);
+ struct mchp_pit64b_timer *timer = clkevt_to_mchp_pit64b_timer(cedev);
mchp_pit64b_reset(timer, mchp_pit64b_ce_cycles, MCHP_PIT64B_MR_CONT,
MCHP_PIT64B_IER_PERIOD);
@@ -148,7 +192,7 @@ static int mchp_pit64b_clkevt_set_periodic(struct clock_event_device *cedev)
static int mchp_pit64b_clkevt_set_next_event(unsigned long evt,
struct clock_event_device *cedev)
{
- struct mchp_pit64b_timer *timer = to_mchp_pit64b_timer(cedev);
+ struct mchp_pit64b_timer *timer = clkevt_to_mchp_pit64b_timer(cedev);
mchp_pit64b_reset(timer, evt, MCHP_PIT64B_MR_ONE_SHOT,
MCHP_PIT64B_IER_PERIOD);
@@ -158,21 +202,16 @@ static int mchp_pit64b_clkevt_set_next_event(unsigned long evt,
static void mchp_pit64b_clkevt_suspend(struct clock_event_device *cedev)
{
- struct mchp_pit64b_timer *timer = to_mchp_pit64b_timer(cedev);
+ struct mchp_pit64b_timer *timer = clkevt_to_mchp_pit64b_timer(cedev);
- writel_relaxed(MCHP_PIT64B_CR_SWRST, timer->base + MCHP_PIT64B_CR);
- if (timer->mode & MCHP_PIT64B_MR_SGCLK)
- clk_disable_unprepare(timer->gclk);
- clk_disable_unprepare(timer->pclk);
+ mchp_pit64b_suspend(timer);
}
static void mchp_pit64b_clkevt_resume(struct clock_event_device *cedev)
{
- struct mchp_pit64b_timer *timer = to_mchp_pit64b_timer(cedev);
+ struct mchp_pit64b_timer *timer = clkevt_to_mchp_pit64b_timer(cedev);
- clk_prepare_enable(timer->pclk);
- if (timer->mode & MCHP_PIT64B_MR_SGCLK)
- clk_prepare_enable(timer->gclk);
+ mchp_pit64b_resume(timer);
}
static irqreturn_t mchp_pit64b_interrupt(int irq, void *dev_id)
@@ -296,20 +335,37 @@ done:
static int __init mchp_pit64b_init_clksrc(struct mchp_pit64b_timer *timer,
u32 clk_rate)
{
+ struct mchp_pit64b_clksrc *cs;
int ret;
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return -ENOMEM;
+
mchp_pit64b_reset(timer, ULLONG_MAX, MCHP_PIT64B_MR_CONT, 0);
mchp_pit64b_cs_base = timer->base;
- ret = clocksource_mmio_init(timer->base, MCHP_PIT64B_NAME, clk_rate,
- 210, 64, mchp_pit64b_clksrc_read);
+ cs->timer.base = timer->base;
+ cs->timer.pclk = timer->pclk;
+ cs->timer.gclk = timer->gclk;
+ cs->timer.mode = timer->mode;
+ cs->clksrc.name = MCHP_PIT64B_NAME;
+ cs->clksrc.mask = CLOCKSOURCE_MASK(64);
+ cs->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
+ cs->clksrc.rating = 210;
+ cs->clksrc.read = mchp_pit64b_clksrc_read;
+ cs->clksrc.suspend = mchp_pit64b_clksrc_suspend;
+ cs->clksrc.resume = mchp_pit64b_clksrc_resume;
+
+ ret = clocksource_register_hz(&cs->clksrc, clk_rate);
if (ret) {
pr_debug("clksrc: Failed to register PIT64B clocksource!\n");
/* Stop timer. */
writel_relaxed(MCHP_PIT64B_CR_SWRST,
timer->base + MCHP_PIT64B_CR);
+ kfree(cs);
return ret;
}
diff --git a/drivers/clocksource/timer-prima2.c b/drivers/clocksource/timer-prima2.c
deleted file mode 100644
index c5d469342a9d3..0000000000000
--- a/drivers/clocksource/timer-prima2.c
+++ /dev/null
@@ -1,242 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * System timer for CSR SiRFprimaII
- *
- * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company.
- */
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/clockchips.h>
-#include <linux/clocksource.h>
-#include <linux/bitops.h>
-#include <linux/irq.h>
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/of.h>
-#include <linux/of_irq.h>
-#include <linux/of_address.h>
-#include <linux/sched_clock.h>
-
-#define PRIMA2_CLOCK_FREQ 1000000
-
-#define SIRFSOC_TIMER_COUNTER_LO 0x0000
-#define SIRFSOC_TIMER_COUNTER_HI 0x0004
-#define SIRFSOC_TIMER_MATCH_0 0x0008
-#define SIRFSOC_TIMER_MATCH_1 0x000C
-#define SIRFSOC_TIMER_MATCH_2 0x0010
-#define SIRFSOC_TIMER_MATCH_3 0x0014
-#define SIRFSOC_TIMER_MATCH_4 0x0018
-#define SIRFSOC_TIMER_MATCH_5 0x001C
-#define SIRFSOC_TIMER_STATUS 0x0020
-#define SIRFSOC_TIMER_INT_EN 0x0024
-#define SIRFSOC_TIMER_WATCHDOG_EN 0x0028
-#define SIRFSOC_TIMER_DIV 0x002C
-#define SIRFSOC_TIMER_LATCH 0x0030
-#define SIRFSOC_TIMER_LATCHED_LO 0x0034
-#define SIRFSOC_TIMER_LATCHED_HI 0x0038
-
-#define SIRFSOC_TIMER_WDT_INDEX 5
-
-#define SIRFSOC_TIMER_LATCH_BIT BIT(0)
-
-#define SIRFSOC_TIMER_REG_CNT 11
-
-static const u32 sirfsoc_timer_reg_list[SIRFSOC_TIMER_REG_CNT] = {
- SIRFSOC_TIMER_MATCH_0, SIRFSOC_TIMER_MATCH_1, SIRFSOC_TIMER_MATCH_2,
- SIRFSOC_TIMER_MATCH_3, SIRFSOC_TIMER_MATCH_4, SIRFSOC_TIMER_MATCH_5,
- SIRFSOC_TIMER_INT_EN, SIRFSOC_TIMER_WATCHDOG_EN, SIRFSOC_TIMER_DIV,
- SIRFSOC_TIMER_LATCHED_LO, SIRFSOC_TIMER_LATCHED_HI,
-};
-
-static u32 sirfsoc_timer_reg_val[SIRFSOC_TIMER_REG_CNT];
-
-static void __iomem *sirfsoc_timer_base;
-
-/* timer0 interrupt handler */
-static irqreturn_t sirfsoc_timer_interrupt(int irq, void *dev_id)
-{
- struct clock_event_device *ce = dev_id;
-
- WARN_ON(!(readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_STATUS) &
- BIT(0)));
-
- /* clear timer0 interrupt */
- writel_relaxed(BIT(0), sirfsoc_timer_base + SIRFSOC_TIMER_STATUS);
-
- ce->event_handler(ce);
-
- return IRQ_HANDLED;
-}
-
-/* read 64-bit timer counter */
-static u64 notrace sirfsoc_timer_read(struct clocksource *cs)
-{
- u64 cycles;
-
- /* latch the 64-bit timer counter */
- writel_relaxed(SIRFSOC_TIMER_LATCH_BIT,
- sirfsoc_timer_base + SIRFSOC_TIMER_LATCH);
- cycles = readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_LATCHED_HI);
- cycles = (cycles << 32) |
- readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_LATCHED_LO);
-
- return cycles;
-}
-
-static int sirfsoc_timer_set_next_event(unsigned long delta,
- struct clock_event_device *ce)
-{
- unsigned long now, next;
-
- writel_relaxed(SIRFSOC_TIMER_LATCH_BIT,
- sirfsoc_timer_base + SIRFSOC_TIMER_LATCH);
- now = readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_LATCHED_LO);
- next = now + delta;
- writel_relaxed(next, sirfsoc_timer_base + SIRFSOC_TIMER_MATCH_0);
- writel_relaxed(SIRFSOC_TIMER_LATCH_BIT,
- sirfsoc_timer_base + SIRFSOC_TIMER_LATCH);
- now = readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_LATCHED_LO);
-
- return next - now > delta ? -ETIME : 0;
-}
-
-static int sirfsoc_timer_shutdown(struct clock_event_device *evt)
-{
- u32 val = readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_INT_EN);
-
- writel_relaxed(val & ~BIT(0),
- sirfsoc_timer_base + SIRFSOC_TIMER_INT_EN);
- return 0;
-}
-
-static int sirfsoc_timer_set_oneshot(struct clock_event_device *evt)
-{
- u32 val = readl_relaxed(sirfsoc_timer_base + SIRFSOC_TIMER_INT_EN);
-
- writel_relaxed(val | BIT(0), sirfsoc_timer_base + SIRFSOC_TIMER_INT_EN);
- return 0;
-}
-
-static void sirfsoc_clocksource_suspend(struct clocksource *cs)
-{
- int i;
-
- writel_relaxed(SIRFSOC_TIMER_LATCH_BIT,
- sirfsoc_timer_base + SIRFSOC_TIMER_LATCH);
-
- for (i = 0; i < SIRFSOC_TIMER_REG_CNT; i++)
- sirfsoc_timer_reg_val[i] =
- readl_relaxed(sirfsoc_timer_base +
- sirfsoc_timer_reg_list[i]);
-}
-
-static void sirfsoc_clocksource_resume(struct clocksource *cs)
-{
- int i;
-
- for (i = 0; i < SIRFSOC_TIMER_REG_CNT - 2; i++)
- writel_relaxed(sirfsoc_timer_reg_val[i],
- sirfsoc_timer_base + sirfsoc_timer_reg_list[i]);
-
- writel_relaxed(sirfsoc_timer_reg_val[SIRFSOC_TIMER_REG_CNT - 2],
- sirfsoc_timer_base + SIRFSOC_TIMER_COUNTER_LO);
- writel_relaxed(sirfsoc_timer_reg_val[SIRFSOC_TIMER_REG_CNT - 1],
- sirfsoc_timer_base + SIRFSOC_TIMER_COUNTER_HI);
-}
-
-static struct clock_event_device sirfsoc_clockevent = {
- .name = "sirfsoc_clockevent",
- .rating = 200,
- .features = CLOCK_EVT_FEAT_ONESHOT,
- .set_state_shutdown = sirfsoc_timer_shutdown,
- .set_state_oneshot = sirfsoc_timer_set_oneshot,
- .set_next_event = sirfsoc_timer_set_next_event,
-};
-
-static struct clocksource sirfsoc_clocksource = {
- .name = "sirfsoc_clocksource",
- .rating = 200,
- .mask = CLOCKSOURCE_MASK(64),
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
- .read = sirfsoc_timer_read,
- .suspend = sirfsoc_clocksource_suspend,
- .resume = sirfsoc_clocksource_resume,
-};
-
-/* Overwrite weak default sched_clock with more precise one */
-static u64 notrace sirfsoc_read_sched_clock(void)
-{
- return sirfsoc_timer_read(NULL);
-}
-
-static void __init sirfsoc_clockevent_init(void)
-{
- sirfsoc_clockevent.cpumask = cpumask_of(0);
- clockevents_config_and_register(&sirfsoc_clockevent, PRIMA2_CLOCK_FREQ,
- 2, -2);
-}
-
-/* initialize the kernel jiffy timer source */
-static int __init sirfsoc_prima2_timer_init(struct device_node *np)
-{
- unsigned long rate;
- unsigned int irq;
- struct clk *clk;
- int ret;
-
- clk = of_clk_get(np, 0);
- if (IS_ERR(clk)) {
- pr_err("Failed to get clock\n");
- return PTR_ERR(clk);
- }
-
- ret = clk_prepare_enable(clk);
- if (ret) {
- pr_err("Failed to enable clock\n");
- return ret;
- }
-
- rate = clk_get_rate(clk);
-
- if (rate < PRIMA2_CLOCK_FREQ || rate % PRIMA2_CLOCK_FREQ) {
- pr_err("Invalid clock rate\n");
- return -EINVAL;
- }
-
- sirfsoc_timer_base = of_iomap(np, 0);
- if (!sirfsoc_timer_base) {
- pr_err("unable to map timer cpu registers\n");
- return -ENXIO;
- }
-
- irq = irq_of_parse_and_map(np, 0);
-
- writel_relaxed(rate / PRIMA2_CLOCK_FREQ / 2 - 1,
- sirfsoc_timer_base + SIRFSOC_TIMER_DIV);
- writel_relaxed(0, sirfsoc_timer_base + SIRFSOC_TIMER_COUNTER_LO);
- writel_relaxed(0, sirfsoc_timer_base + SIRFSOC_TIMER_COUNTER_HI);
- writel_relaxed(BIT(0), sirfsoc_timer_base + SIRFSOC_TIMER_STATUS);
-
- ret = clocksource_register_hz(&sirfsoc_clocksource, PRIMA2_CLOCK_FREQ);
- if (ret) {
- pr_err("Failed to register clocksource\n");
- return ret;
- }
-
- sched_clock_register(sirfsoc_read_sched_clock, 64, PRIMA2_CLOCK_FREQ);
-
- ret = request_irq(irq, sirfsoc_timer_interrupt, IRQF_TIMER,
- "sirfsoc_timer0", &sirfsoc_clockevent);
- if (ret) {
- pr_err("Failed to setup irq\n");
- return ret;
- }
-
- sirfsoc_clockevent_init();
-
- return 0;
-}
-TIMER_OF_DECLARE(sirfsoc_prima2_timer,
- "sirf,prima2-tick", sirfsoc_prima2_timer_init);
diff --git a/drivers/clocksource/timer-tango-xtal.c b/drivers/clocksource/timer-tango-xtal.c
deleted file mode 100644
index 3f94e454ef999..0000000000000
--- a/drivers/clocksource/timer-tango-xtal.c
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/clocksource.h>
-#include <linux/sched_clock.h>
-#include <linux/of_address.h>
-#include <linux/printk.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/clk.h>
-
-static void __iomem *xtal_in_cnt;
-static struct delay_timer delay_timer;
-
-static unsigned long notrace read_xtal_counter(void)
-{
- return readl_relaxed(xtal_in_cnt);
-}
-
-static u64 notrace read_sched_clock(void)
-{
- return read_xtal_counter();
-}
-
-static int __init tango_clocksource_init(struct device_node *np)
-{
- struct clk *clk;
- int xtal_freq, ret;
-
- xtal_in_cnt = of_iomap(np, 0);
- if (xtal_in_cnt == NULL) {
- pr_err("%pOF: invalid address\n", np);
- return -ENXIO;
- }
-
- clk = of_clk_get(np, 0);
- if (IS_ERR(clk)) {
- pr_err("%pOF: invalid clock\n", np);
- return PTR_ERR(clk);
- }
-
- xtal_freq = clk_get_rate(clk);
- delay_timer.freq = xtal_freq;
- delay_timer.read_current_timer = read_xtal_counter;
-
- ret = clocksource_mmio_init(xtal_in_cnt, "tango-xtal", xtal_freq, 350,
- 32, clocksource_mmio_readl_up);
- if (ret) {
- pr_err("%pOF: registration failed\n", np);
- return ret;
- }
-
- sched_clock_register(read_sched_clock, 32, xtal_freq);
- register_current_timer_delay(&delay_timer);
-
- return 0;
-}
-
-TIMER_OF_DECLARE(tango, "sigma,tick-counter", tango_clocksource_init);
diff --git a/drivers/clocksource/timer-u300.c b/drivers/clocksource/timer-u300.c
deleted file mode 100644
index 37cba8dfd45fa..0000000000000
--- a/drivers/clocksource/timer-u300.c
+++ /dev/null
@@ -1,457 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2007-2009 ST-Ericsson AB
- * Timer COH 901 328, runs the OS timer interrupt.
- * Author: Linus Walleij <linus.walleij@stericsson.com>
- */
-#include <linux/interrupt.h>
-#include <linux/time.h>
-#include <linux/timex.h>
-#include <linux/clockchips.h>
-#include <linux/clocksource.h>
-#include <linux/types.h>
-#include <linux/io.h>
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/irq.h>
-#include <linux/delay.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/sched_clock.h>
-
-/* Generic stuff */
-#include <asm/mach/map.h>
-#include <asm/mach/time.h>
-
-/*
- * APP side special timer registers
- * This timer contains four timers which can fire an interrupt each.
- * OS (operating system) timer @ 32768 Hz
- * DD (device driver) timer @ 1 kHz
- * GP1 (general purpose 1) timer @ 1MHz
- * GP2 (general purpose 2) timer @ 1MHz
- */
-
-/* Reset OS Timer 32bit (-/W) */
-#define U300_TIMER_APP_ROST (0x0000)
-#define U300_TIMER_APP_ROST_TIMER_RESET (0x00000000)
-/* Enable OS Timer 32bit (-/W) */
-#define U300_TIMER_APP_EOST (0x0004)
-#define U300_TIMER_APP_EOST_TIMER_ENABLE (0x00000000)
-/* Disable OS Timer 32bit (-/W) */
-#define U300_TIMER_APP_DOST (0x0008)
-#define U300_TIMER_APP_DOST_TIMER_DISABLE (0x00000000)
-/* OS Timer Mode Register 32bit (-/W) */
-#define U300_TIMER_APP_SOSTM (0x000c)
-#define U300_TIMER_APP_SOSTM_MODE_CONTINUOUS (0x00000000)
-#define U300_TIMER_APP_SOSTM_MODE_ONE_SHOT (0x00000001)
-/* OS Timer Status Register 32bit (R/-) */
-#define U300_TIMER_APP_OSTS (0x0010)
-#define U300_TIMER_APP_OSTS_TIMER_STATE_MASK (0x0000000F)
-#define U300_TIMER_APP_OSTS_TIMER_STATE_IDLE (0x00000001)
-#define U300_TIMER_APP_OSTS_TIMER_STATE_ACTIVE (0x00000002)
-#define U300_TIMER_APP_OSTS_ENABLE_IND (0x00000010)
-#define U300_TIMER_APP_OSTS_MODE_MASK (0x00000020)
-#define U300_TIMER_APP_OSTS_MODE_CONTINUOUS (0x00000000)
-#define U300_TIMER_APP_OSTS_MODE_ONE_SHOT (0x00000020)
-#define U300_TIMER_APP_OSTS_IRQ_ENABLED_IND (0x00000040)
-#define U300_TIMER_APP_OSTS_IRQ_PENDING_IND (0x00000080)
-/* OS Timer Current Count Register 32bit (R/-) */
-#define U300_TIMER_APP_OSTCC (0x0014)
-/* OS Timer Terminal Count Register 32bit (R/W) */
-#define U300_TIMER_APP_OSTTC (0x0018)
-/* OS Timer Interrupt Enable Register 32bit (-/W) */
-#define U300_TIMER_APP_OSTIE (0x001c)
-#define U300_TIMER_APP_OSTIE_IRQ_DISABLE (0x00000000)
-#define U300_TIMER_APP_OSTIE_IRQ_ENABLE (0x00000001)
-/* OS Timer Interrupt Acknowledge Register 32bit (-/W) */
-#define U300_TIMER_APP_OSTIA (0x0020)
-#define U300_TIMER_APP_OSTIA_IRQ_ACK (0x00000080)
-
-/* Reset DD Timer 32bit (-/W) */
-#define U300_TIMER_APP_RDDT (0x0040)
-#define U300_TIMER_APP_RDDT_TIMER_RESET (0x00000000)
-/* Enable DD Timer 32bit (-/W) */
-#define U300_TIMER_APP_EDDT (0x0044)
-#define U300_TIMER_APP_EDDT_TIMER_ENABLE (0x00000000)
-/* Disable DD Timer 32bit (-/W) */
-#define U300_TIMER_APP_DDDT (0x0048)
-#define U300_TIMER_APP_DDDT_TIMER_DISABLE (0x00000000)
-/* DD Timer Mode Register 32bit (-/W) */
-#define U300_TIMER_APP_SDDTM (0x004c)
-#define U300_TIMER_APP_SDDTM_MODE_CONTINUOUS (0x00000000)
-#define U300_TIMER_APP_SDDTM_MODE_ONE_SHOT (0x00000001)
-/* DD Timer Status Register 32bit (R/-) */
-#define U300_TIMER_APP_DDTS (0x0050)
-#define U300_TIMER_APP_DDTS_TIMER_STATE_MASK (0x0000000F)
-#define U300_TIMER_APP_DDTS_TIMER_STATE_IDLE (0x00000001)
-#define U300_TIMER_APP_DDTS_TIMER_STATE_ACTIVE (0x00000002)
-#define U300_TIMER_APP_DDTS_ENABLE_IND (0x00000010)
-#define U300_TIMER_APP_DDTS_MODE_MASK (0x00000020)
-#define U300_TIMER_APP_DDTS_MODE_CONTINUOUS (0x00000000)
-#define U300_TIMER_APP_DDTS_MODE_ONE_SHOT (0x00000020)
-#define U300_TIMER_APP_DDTS_IRQ_ENABLED_IND (0x00000040)
-#define U300_TIMER_APP_DDTS_IRQ_PENDING_IND (0x00000080)
-/* DD Timer Current Count Register 32bit (R/-) */
-#define U300_TIMER_APP_DDTCC (0x0054)
-/* DD Timer Terminal Count Register 32bit (R/W) */
-#define U300_TIMER_APP_DDTTC (0x0058)
-/* DD Timer Interrupt Enable Register 32bit (-/W) */
-#define U300_TIMER_APP_DDTIE (0x005c)
-#define U300_TIMER_APP_DDTIE_IRQ_DISABLE (0x00000000)
-#define U300_TIMER_APP_DDTIE_IRQ_ENABLE (0x00000001)
-/* DD Timer Interrupt Acknowledge Register 32bit (-/W) */
-#define U300_TIMER_APP_DDTIA (0x0060)
-#define U300_TIMER_APP_DDTIA_IRQ_ACK (0x00000080)
-
-/* Reset GP1 Timer 32bit (-/W) */
-#define U300_TIMER_APP_RGPT1 (0x0080)
-#define U300_TIMER_APP_RGPT1_TIMER_RESET (0x00000000)
-/* Enable GP1 Timer 32bit (-/W) */
-#define U300_TIMER_APP_EGPT1 (0x0084)
-#define U300_TIMER_APP_EGPT1_TIMER_ENABLE (0x00000000)
-/* Disable GP1 Timer 32bit (-/W) */
-#define U300_TIMER_APP_DGPT1 (0x0088)
-#define U300_TIMER_APP_DGPT1_TIMER_DISABLE (0x00000000)
-/* GP1 Timer Mode Register 32bit (-/W) */
-#define U300_TIMER_APP_SGPT1M (0x008c)
-#define U300_TIMER_APP_SGPT1M_MODE_CONTINUOUS (0x00000000)
-#define U300_TIMER_APP_SGPT1M_MODE_ONE_SHOT (0x00000001)
-/* GP1 Timer Status Register 32bit (R/-) */
-#define U300_TIMER_APP_GPT1S (0x0090)
-#define U300_TIMER_APP_GPT1S_TIMER_STATE_MASK (0x0000000F)
-#define U300_TIMER_APP_GPT1S_TIMER_STATE_IDLE (0x00000001)
-#define U300_TIMER_APP_GPT1S_TIMER_STATE_ACTIVE (0x00000002)
-#define U300_TIMER_APP_GPT1S_ENABLE_IND (0x00000010)
-#define U300_TIMER_APP_GPT1S_MODE_MASK (0x00000020)
-#define U300_TIMER_APP_GPT1S_MODE_CONTINUOUS (0x00000000)
-#define U300_TIMER_APP_GPT1S_MODE_ONE_SHOT (0x00000020)
-#define U300_TIMER_APP_GPT1S_IRQ_ENABLED_IND (0x00000040)
-#define U300_TIMER_APP_GPT1S_IRQ_PENDING_IND (0x00000080)
-/* GP1 Timer Current Count Register 32bit (R/-) */
-#define U300_TIMER_APP_GPT1CC (0x0094)
-/* GP1 Timer Terminal Count Register 32bit (R/W) */
-#define U300_TIMER_APP_GPT1TC (0x0098)
-/* GP1 Timer Interrupt Enable Register 32bit (-/W) */
-#define U300_TIMER_APP_GPT1IE (0x009c)
-#define U300_TIMER_APP_GPT1IE_IRQ_DISABLE (0x00000000)
-#define U300_TIMER_APP_GPT1IE_IRQ_ENABLE (0x00000001)
-/* GP1 Timer Interrupt Acknowledge Register 32bit (-/W) */
-#define U300_TIMER_APP_GPT1IA (0x00a0)
-#define U300_TIMER_APP_GPT1IA_IRQ_ACK (0x00000080)
-
-/* Reset GP2 Timer 32bit (-/W) */
-#define U300_TIMER_APP_RGPT2 (0x00c0)
-#define U300_TIMER_APP_RGPT2_TIMER_RESET (0x00000000)
-/* Enable GP2 Timer 32bit (-/W) */
-#define U300_TIMER_APP_EGPT2 (0x00c4)
-#define U300_TIMER_APP_EGPT2_TIMER_ENABLE (0x00000000)
-/* Disable GP2 Timer 32bit (-/W) */
-#define U300_TIMER_APP_DGPT2 (0x00c8)
-#define U300_TIMER_APP_DGPT2_TIMER_DISABLE (0x00000000)
-/* GP2 Timer Mode Register 32bit (-/W) */
-#define U300_TIMER_APP_SGPT2M (0x00cc)
-#define U300_TIMER_APP_SGPT2M_MODE_CONTINUOUS (0x00000000)
-#define U300_TIMER_APP_SGPT2M_MODE_ONE_SHOT (0x00000001)
-/* GP2 Timer Status Register 32bit (R/-) */
-#define U300_TIMER_APP_GPT2S (0x00d0)
-#define U300_TIMER_APP_GPT2S_TIMER_STATE_MASK (0x0000000F)
-#define U300_TIMER_APP_GPT2S_TIMER_STATE_IDLE (0x00000001)
-#define U300_TIMER_APP_GPT2S_TIMER_STATE_ACTIVE (0x00000002)
-#define U300_TIMER_APP_GPT2S_ENABLE_IND (0x00000010)
-#define U300_TIMER_APP_GPT2S_MODE_MASK (0x00000020)
-#define U300_TIMER_APP_GPT2S_MODE_CONTINUOUS (0x00000000)
-#define U300_TIMER_APP_GPT2S_MODE_ONE_SHOT (0x00000020)
-#define U300_TIMER_APP_GPT2S_IRQ_ENABLED_IND (0x00000040)
-#define U300_TIMER_APP_GPT2S_IRQ_PENDING_IND (0x00000080)
-/* GP2 Timer Current Count Register 32bit (R/-) */
-#define U300_TIMER_APP_GPT2CC (0x00d4)
-/* GP2 Timer Terminal Count Register 32bit (R/W) */
-#define U300_TIMER_APP_GPT2TC (0x00d8)
-/* GP2 Timer Interrupt Enable Register 32bit (-/W) */
-#define U300_TIMER_APP_GPT2IE (0x00dc)
-#define U300_TIMER_APP_GPT2IE_IRQ_DISABLE (0x00000000)
-#define U300_TIMER_APP_GPT2IE_IRQ_ENABLE (0x00000001)
-/* GP2 Timer Interrupt Acknowledge Register 32bit (-/W) */
-#define U300_TIMER_APP_GPT2IA (0x00e0)
-#define U300_TIMER_APP_GPT2IA_IRQ_ACK (0x00000080)
-
-/* Clock request control register - all four timers */
-#define U300_TIMER_APP_CRC (0x100)
-#define U300_TIMER_APP_CRC_CLOCK_REQUEST_ENABLE (0x00000001)
-
-static void __iomem *u300_timer_base;
-
-struct u300_clockevent_data {
- struct clock_event_device cevd;
- unsigned ticks_per_jiffy;
-};
-
-static int u300_shutdown(struct clock_event_device *evt)
-{
- /* Disable interrupts on GP1 */
- writel(U300_TIMER_APP_GPT1IE_IRQ_DISABLE,
- u300_timer_base + U300_TIMER_APP_GPT1IE);
- /* Disable GP1 */
- writel(U300_TIMER_APP_DGPT1_TIMER_DISABLE,
- u300_timer_base + U300_TIMER_APP_DGPT1);
- return 0;
-}
-
-/*
- * If we have oneshot timer active, the oneshot scheduling function
- * u300_set_next_event() is called immediately after.
- */
-static int u300_set_oneshot(struct clock_event_device *evt)
-{
- /* Just return; here? */
- /*
- * The actual event will be programmed by the next event hook,
- * so we just set a dummy value somewhere at the end of the
- * universe here.
- */
- /* Disable interrupts on GPT1 */
- writel(U300_TIMER_APP_GPT1IE_IRQ_DISABLE,
- u300_timer_base + U300_TIMER_APP_GPT1IE);
- /* Disable GP1 while we're reprogramming it. */
- writel(U300_TIMER_APP_DGPT1_TIMER_DISABLE,
- u300_timer_base + U300_TIMER_APP_DGPT1);
- /*
- * Expire far in the future, u300_set_next_event() will be
- * called soon...
- */
- writel(0xFFFFFFFF, u300_timer_base + U300_TIMER_APP_GPT1TC);
- /* We run one shot per tick here! */
- writel(U300_TIMER_APP_SGPT1M_MODE_ONE_SHOT,
- u300_timer_base + U300_TIMER_APP_SGPT1M);
- /* Enable interrupts for this timer */
- writel(U300_TIMER_APP_GPT1IE_IRQ_ENABLE,
- u300_timer_base + U300_TIMER_APP_GPT1IE);
- /* Enable timer */
- writel(U300_TIMER_APP_EGPT1_TIMER_ENABLE,
- u300_timer_base + U300_TIMER_APP_EGPT1);
- return 0;
-}
-
-static int u300_set_periodic(struct clock_event_device *evt)
-{
- struct u300_clockevent_data *cevdata =
- container_of(evt, struct u300_clockevent_data, cevd);
-
- /* Disable interrupts on GPT1 */
- writel(U300_TIMER_APP_GPT1IE_IRQ_DISABLE,
- u300_timer_base + U300_TIMER_APP_GPT1IE);
- /* Disable GP1 while we're reprogramming it. */
- writel(U300_TIMER_APP_DGPT1_TIMER_DISABLE,
- u300_timer_base + U300_TIMER_APP_DGPT1);
- /*
- * Set the periodic mode to a certain number of ticks per
- * jiffy.
- */
- writel(cevdata->ticks_per_jiffy,
- u300_timer_base + U300_TIMER_APP_GPT1TC);
- /*
- * Set continuous mode, so the timer keeps triggering
- * interrupts.
- */
- writel(U300_TIMER_APP_SGPT1M_MODE_CONTINUOUS,
- u300_timer_base + U300_TIMER_APP_SGPT1M);
- /* Enable timer interrupts */
- writel(U300_TIMER_APP_GPT1IE_IRQ_ENABLE,
- u300_timer_base + U300_TIMER_APP_GPT1IE);
- /* Then enable the OS timer again */
- writel(U300_TIMER_APP_EGPT1_TIMER_ENABLE,
- u300_timer_base + U300_TIMER_APP_EGPT1);
- return 0;
-}
-
-/*
- * The app timer in one shot mode obviously has to be reprogrammed
- * in EXACTLY this sequence to work properly. Do NOT try to e.g. replace
- * the interrupt disable + timer disable commands with a reset command,
- * it will fail miserably. Apparently (and I found this the hard way)
- * the timer is very sensitive to the instruction order, though you don't
- * get that impression from the data sheet.
- */
-static int u300_set_next_event(unsigned long cycles,
- struct clock_event_device *evt)
-
-{
- /* Disable interrupts on GPT1 */
- writel(U300_TIMER_APP_GPT1IE_IRQ_DISABLE,
- u300_timer_base + U300_TIMER_APP_GPT1IE);
- /* Disable GP1 while we're reprogramming it. */
- writel(U300_TIMER_APP_DGPT1_TIMER_DISABLE,
- u300_timer_base + U300_TIMER_APP_DGPT1);
- /* Reset the General Purpose timer 1. */
- writel(U300_TIMER_APP_RGPT1_TIMER_RESET,
- u300_timer_base + U300_TIMER_APP_RGPT1);
- /* IRQ in n * cycles */
- writel(cycles, u300_timer_base + U300_TIMER_APP_GPT1TC);
- /*
- * We run one shot per tick here! (This is necessary to reconfigure,
- * the timer will tilt if you don't!)
- */
- writel(U300_TIMER_APP_SGPT1M_MODE_ONE_SHOT,
- u300_timer_base + U300_TIMER_APP_SGPT1M);
- /* Enable timer interrupts */
- writel(U300_TIMER_APP_GPT1IE_IRQ_ENABLE,
- u300_timer_base + U300_TIMER_APP_GPT1IE);
- /* Then enable the OS timer again */
- writel(U300_TIMER_APP_EGPT1_TIMER_ENABLE,
- u300_timer_base + U300_TIMER_APP_EGPT1);
- return 0;
-}
-
-static struct u300_clockevent_data u300_clockevent_data = {
- /* Use general purpose timer 1 as clock event */
- .cevd = {
- .name = "GPT1",
- /* Reasonably fast and accurate clock event */
- .rating = 300,
- .features = CLOCK_EVT_FEAT_PERIODIC |
- CLOCK_EVT_FEAT_ONESHOT,
- .set_next_event = u300_set_next_event,
- .set_state_shutdown = u300_shutdown,
- .set_state_periodic = u300_set_periodic,
- .set_state_oneshot = u300_set_oneshot,
- },
-};
-
-/* Clock event timer interrupt handler */
-static irqreturn_t u300_timer_interrupt(int irq, void *dev_id)
-{
- struct clock_event_device *evt = &u300_clockevent_data.cevd;
- /* ACK/Clear timer IRQ for the APP GPT1 Timer */
-
- writel(U300_TIMER_APP_GPT1IA_IRQ_ACK,
- u300_timer_base + U300_TIMER_APP_GPT1IA);
- evt->event_handler(evt);
- return IRQ_HANDLED;
-}
-
-/*
- * Override the global weak sched_clock symbol with this
- * local implementation which uses the clocksource to get some
- * better resolution when scheduling the kernel. We accept that
- * this wraps around for now, since it is just a relative time
- * stamp. (Inspired by OMAP implementation.)
- */
-
-static u64 notrace u300_read_sched_clock(void)
-{
- return readl(u300_timer_base + U300_TIMER_APP_GPT2CC);
-}
-
-static unsigned long u300_read_current_timer(void)
-{
- return readl(u300_timer_base + U300_TIMER_APP_GPT2CC);
-}
-
-static struct delay_timer u300_delay_timer;
-
-/*
- * This sets up the system timers, clock source and clock event.
- */
-static int __init u300_timer_init_of(struct device_node *np)
-{
- unsigned int irq;
- struct clk *clk;
- unsigned long rate;
- int ret;
-
- u300_timer_base = of_iomap(np, 0);
- if (!u300_timer_base) {
- pr_err("could not ioremap system timer\n");
- return -ENXIO;
- }
-
- /* Get the IRQ for the GP1 timer */
- irq = irq_of_parse_and_map(np, 2);
- if (!irq) {
- pr_err("no IRQ for system timer\n");
- return -EINVAL;
- }
-
- pr_info("U300 GP1 timer @ base: %p, IRQ: %u\n", u300_timer_base, irq);
-
- /* Clock the interrupt controller */
- clk = of_clk_get(np, 0);
- if (IS_ERR(clk))
- return PTR_ERR(clk);
-
- ret = clk_prepare_enable(clk);
- if (ret)
- return ret;
-
- rate = clk_get_rate(clk);
-
- u300_clockevent_data.ticks_per_jiffy = DIV_ROUND_CLOSEST(rate, HZ);
-
- sched_clock_register(u300_read_sched_clock, 32, rate);
-
- u300_delay_timer.read_current_timer = &u300_read_current_timer;
- u300_delay_timer.freq = rate;
- register_current_timer_delay(&u300_delay_timer);
-
- /*
- * Disable the "OS" and "DD" timers - these are designed for Symbian!
- * Example usage in cnh1601578 cpu subsystem pd_timer_app.c
- */
- writel(U300_TIMER_APP_CRC_CLOCK_REQUEST_ENABLE,
- u300_timer_base + U300_TIMER_APP_CRC);
- writel(U300_TIMER_APP_ROST_TIMER_RESET,
- u300_timer_base + U300_TIMER_APP_ROST);
- writel(U300_TIMER_APP_DOST_TIMER_DISABLE,
- u300_timer_base + U300_TIMER_APP_DOST);
- writel(U300_TIMER_APP_RDDT_TIMER_RESET,
- u300_timer_base + U300_TIMER_APP_RDDT);
- writel(U300_TIMER_APP_DDDT_TIMER_DISABLE,
- u300_timer_base + U300_TIMER_APP_DDDT);
-
- /* Reset the General Purpose timer 1. */
- writel(U300_TIMER_APP_RGPT1_TIMER_RESET,
- u300_timer_base + U300_TIMER_APP_RGPT1);
-
- /* Set up the IRQ handler */
- ret = request_irq(irq, u300_timer_interrupt,
- IRQF_TIMER | IRQF_IRQPOLL, "U300 Timer Tick", NULL);
- if (ret)
- return ret;
-
- /* Reset the General Purpose timer 2 */
- writel(U300_TIMER_APP_RGPT2_TIMER_RESET,
- u300_timer_base + U300_TIMER_APP_RGPT2);
- /* Set this timer to run around forever */
- writel(0xFFFFFFFFU, u300_timer_base + U300_TIMER_APP_GPT2TC);
- /* Set continuous mode so it wraps around */
- writel(U300_TIMER_APP_SGPT2M_MODE_CONTINUOUS,
- u300_timer_base + U300_TIMER_APP_SGPT2M);
- /* Disable timer interrupts */
- writel(U300_TIMER_APP_GPT2IE_IRQ_DISABLE,
- u300_timer_base + U300_TIMER_APP_GPT2IE);
- /* Then enable the GP2 timer to use as a free running us counter */
- writel(U300_TIMER_APP_EGPT2_TIMER_ENABLE,
- u300_timer_base + U300_TIMER_APP_EGPT2);
-
- /* Use general purpose timer 2 as clock source */
- ret = clocksource_mmio_init(u300_timer_base + U300_TIMER_APP_GPT2CC,
- "GPT2", rate, 300, 32, clocksource_mmio_readl_up);
- if (ret) {
- pr_err("timer: failed to initialize U300 clock source\n");
- return ret;
- }
-
- /* Configure and register the clockevent */
- clockevents_config_and_register(&u300_clockevent_data.cevd, rate,
- 1, 0xffffffff);
-
- /*
- * TODO: init and register the rest of the timers too, they can be
- * used by hrtimers!
- */
- return 0;
-}
-
-TIMER_OF_DECLARE(u300_timer, "stericsson,u300-apptimer",
- u300_timer_init_of);
diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c
index 22ece1ad68a8f..b69d63143e0d8 100644
--- a/drivers/firmware/efi/libstub/arm64-stub.c
+++ b/drivers/firmware/efi/libstub/arm64-stub.c
@@ -61,10 +61,10 @@ efi_status_t handle_kernel_image(unsigned long *image_addr,
status = efi_get_random_bytes(sizeof(phys_seed),
(u8 *)&phys_seed);
if (status == EFI_NOT_FOUND) {
- efi_info("EFI_RNG_PROTOCOL unavailable, KASLR will be disabled\n");
+ efi_info("EFI_RNG_PROTOCOL unavailable\n");
efi_nokaslr = true;
} else if (status != EFI_SUCCESS) {
- efi_err("efi_get_random_bytes() failed (0x%lx), KASLR will be disabled\n",
+ efi_err("efi_get_random_bytes() failed (0x%lx)\n",
status);
efi_nokaslr = true;
}
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index b50a6c67d9bd2..cde0a2ef507d9 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -672,7 +672,7 @@ typedef union efi_tcg2_protocol efi_tcg2_protocol_t;
union efi_tcg2_protocol {
struct {
void *get_capability;
- efi_status_t (__efiapi *get_event_log)(efi_handle_t,
+ efi_status_t (__efiapi *get_event_log)(efi_tcg2_protocol_t *,
efi_tcg2_event_log_format,
efi_physical_addr_t *,
efi_physical_addr_t *,
@@ -849,4 +849,13 @@ void efi_handle_post_ebs_state(void);
enum efi_secureboot_mode efi_get_secureboot(void);
+#ifdef CONFIG_RESET_ATTACK_MITIGATION
+void efi_enable_reset_attack_mitigation(void);
+#else
+static inline void
+efi_enable_reset_attack_mitigation(void) { }
+#endif
+
+void efi_retrieve_tpm2_eventlog(void);
+
#endif
diff --git a/drivers/s390/char/sclp_early_core.c b/drivers/s390/char/sclp_early_core.c
index ec9f8ad5341c5..b7329af076a0f 100644
--- a/drivers/s390/char/sclp_early_core.c
+++ b/drivers/s390/char/sclp_early_core.c
@@ -66,13 +66,13 @@ int sclp_early_cmd(sclp_cmdw_t cmd, void *sccb)
unsigned long flags;
int rc;
- raw_local_irq_save(flags);
+ flags = arch_local_irq_save();
rc = sclp_service_call(cmd, sccb);
if (rc)
goto out;
sclp_early_wait_irq();
out:
- raw_local_irq_restore(flags);
+ arch_local_irq_restore(flags);
return rc;
}
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index 612f063c1cfcd..f5af2571f9b71 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -76,7 +76,9 @@ struct cpufreq_cooling_device {
struct em_perf_domain *em;
struct cpufreq_policy *policy;
struct list_head node;
+#ifndef CONFIG_SMP
struct time_in_idle *idle_time;
+#endif
struct freq_qos_request qos_req;
};
@@ -132,14 +134,25 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
}
/**
- * get_load() - get load for a cpu since last updated
- * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu
- * @cpu: cpu number
- * @cpu_idx: index of the cpu in time_in_idle*
+ * get_load() - get load for a cpu
+ * @cpufreq_cdev: struct cpufreq_cooling_device for the cpu
+ * @cpu: cpu number
+ * @cpu_idx: index of the cpu in time_in_idle array
*
* Return: The average load of cpu @cpu in percentage since this
* function was last called.
*/
+#ifdef CONFIG_SMP
+static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
+ int cpu_idx)
+{
+ unsigned long max = arch_scale_cpu_capacity(cpu);
+ unsigned long util;
+
+ util = sched_cpu_util(cpu, max);
+ return (util * 100) / max;
+}
+#else /* !CONFIG_SMP */
static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
int cpu_idx)
{
@@ -161,6 +174,7 @@ static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
return load;
}
+#endif /* CONFIG_SMP */
/**
* get_dynamic_power() - calculate the dynamic power
@@ -346,6 +360,36 @@ static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev,
}
#endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */
+#ifdef CONFIG_SMP
+static inline int allocate_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+ return 0;
+}
+
+static inline void free_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+}
+#else
+static int allocate_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+ unsigned int num_cpus = cpumask_weight(cpufreq_cdev->policy->related_cpus);
+
+ cpufreq_cdev->idle_time = kcalloc(num_cpus,
+ sizeof(*cpufreq_cdev->idle_time),
+ GFP_KERNEL);
+ if (!cpufreq_cdev->idle_time)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void free_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+ kfree(cpufreq_cdev->idle_time);
+ cpufreq_cdev->idle_time = NULL;
+}
+#endif /* CONFIG_SMP */
+
static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
unsigned long state)
{
@@ -485,7 +529,7 @@ __cpufreq_cooling_register(struct device_node *np,
struct thermal_cooling_device *cdev;
struct cpufreq_cooling_device *cpufreq_cdev;
char dev_name[THERMAL_NAME_LENGTH];
- unsigned int i, num_cpus;
+ unsigned int i;
struct device *dev;
int ret;
struct thermal_cooling_device_ops *cooling_ops;
@@ -496,7 +540,6 @@ __cpufreq_cooling_register(struct device_node *np,
return ERR_PTR(-ENODEV);
}
-
if (IS_ERR_OR_NULL(policy)) {
pr_err("%s: cpufreq policy isn't valid: %p\n", __func__, policy);
return ERR_PTR(-EINVAL);
@@ -514,12 +557,10 @@ __cpufreq_cooling_register(struct device_node *np,
return ERR_PTR(-ENOMEM);
cpufreq_cdev->policy = policy;
- num_cpus = cpumask_weight(policy->related_cpus);
- cpufreq_cdev->idle_time = kcalloc(num_cpus,
- sizeof(*cpufreq_cdev->idle_time),
- GFP_KERNEL);
- if (!cpufreq_cdev->idle_time) {
- cdev = ERR_PTR(-ENOMEM);
+
+ ret = allocate_idle_time(cpufreq_cdev);
+ if (ret) {
+ cdev = ERR_PTR(ret);
goto free_cdev;
}
@@ -579,7 +620,7 @@ remove_qos_req:
remove_ida:
ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
free_idle_time:
- kfree(cpufreq_cdev->idle_time);
+ free_idle_time(cpufreq_cdev);
free_cdev:
kfree(cpufreq_cdev);
return cdev;
@@ -672,7 +713,7 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
thermal_cooling_device_unregister(cdev);
freq_qos_remove_request(&cpufreq_cdev->qos_req);
ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
- kfree(cpufreq_cdev->idle_time);
+ free_idle_time(cpufreq_cdev);
kfree(cpufreq_cdev);
}
EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig
index 8025b21f43fa5..ce4f59213c7ab 100644
--- a/drivers/thermal/intel/Kconfig
+++ b/drivers/thermal/intel/Kconfig
@@ -8,6 +8,10 @@ config INTEL_POWERCLAMP
enforce idle time which results in more package C-state residency. The
user interface is exposed via generic thermal framework.
+config X86_THERMAL_VECTOR
+ def_bool y
+ depends on X86 && CPU_SUP_INTEL && X86_LOCAL_APIC
+
config X86_PKG_TEMP_THERMAL
tristate "X86 package temperature thermal driver"
depends on X86_THERMAL_VECTOR
diff --git a/drivers/thermal/intel/Makefile b/drivers/thermal/intel/Makefile
index 0d9736ced5d4e..ff2ad30ef3977 100644
--- a/drivers/thermal/intel/Makefile
+++ b/drivers/thermal/intel/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_INTEL_QUARK_DTS_THERMAL) += intel_quark_dts_thermal.o
obj-$(CONFIG_INT340X_THERMAL) += int340x_thermal/
obj-$(CONFIG_INTEL_BXT_PMIC_THERMAL) += intel_bxt_pmic_thermal.o
obj-$(CONFIG_INTEL_PCH_THERMAL) += intel_pch_thermal.o
+obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/drivers/thermal/intel/therm_throt.c
index a7cd2d203ceda..f8e882592ba5d 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -26,13 +26,13 @@
#include <linux/cpu.h>
#include <asm/processor.h>
+#include <asm/thermal.h>
#include <asm/traps.h>
#include <asm/apic.h>
-#include <asm/mce.h>
+#include <asm/irq.h>
#include <asm/msr.h>
-#include <asm/trace/irq_vectors.h>
-#include "internal.h"
+#include "thermal_interrupt.h"
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
@@ -570,7 +570,7 @@ static void notify_thresholds(__u64 msr_val)
}
/* Thermal transition interrupt handler */
-static void intel_thermal_interrupt(void)
+void intel_thermal_interrupt(void)
{
__u64 msr_val;
@@ -606,23 +606,6 @@ static void intel_thermal_interrupt(void)
}
}
-static void unexpected_thermal_interrupt(void)
-{
- pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
- smp_processor_id());
-}
-
-static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-
-DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
-{
- trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
- inc_irq_stat(irq_thermal_count);
- smp_thermal_vector();
- trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
- ack_APIC_irq();
-}
-
/* Thermal monitoring depends on APIC, ACPI and clock modulation */
static int intel_thermal_supported(struct cpuinfo_x86 *c)
{
@@ -633,15 +616,9 @@ static int intel_thermal_supported(struct cpuinfo_x86 *c)
return 1;
}
-void __init mcheck_intel_therm_init(void)
+bool x86_thermal_enabled(void)
{
- /*
- * This function is only called on boot CPU. Save the init thermal
- * LVT value on BSP and use that value to restore APs' thermal LVT
- * entry BIOS programmed later
- */
- if (intel_thermal_supported(&boot_cpu_data))
- lvtthmr_init = apic_read(APIC_LVTTHMR);
+ return atomic_read(&therm_throt_en);
}
void intel_init_thermal(struct cpuinfo_x86 *c)
@@ -653,6 +630,10 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
if (!intel_thermal_supported(c))
return;
+ /* On the BSP? */
+ if (c == &boot_cpu_data)
+ lvtthmr_init = apic_read(APIC_LVTTHMR);
+
/*
* First check if its enabled already, in which case there might
* be some SMM goo which handles it, so we can't even put a handler
@@ -726,8 +707,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
| PACKAGE_THERM_INT_HIGH_ENABLE), h);
}
- smp_thermal_vector = intel_thermal_interrupt;
-
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
diff --git a/drivers/thermal/intel/thermal_interrupt.h b/drivers/thermal/intel/thermal_interrupt.h
new file mode 100644
index 0000000000000..53f427bb58dce
--- /dev/null
+++ b/drivers/thermal/intel/thermal_interrupt.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _INTEL_THERMAL_INTERRUPT_H
+#define _INTEL_THERMAL_INTERRUPT_H
+
+/* Interrupt Handler for package thermal thresholds */
+extern int (*platform_thermal_package_notify)(__u64 msr_val);
+
+/* Interrupt Handler for core thermal thresholds */
+extern int (*platform_thermal_notify)(__u64 msr_val);
+
+/* Callback support of rate control, return true, if
+ * callback has rate control */
+extern bool (*platform_thermal_package_rate_control)(void);
+
+#endif /* _INTEL_THERMAL_INTERRUPT_H */
diff --git a/drivers/thermal/intel/x86_pkg_temp_thermal.c b/drivers/thermal/intel/x86_pkg_temp_thermal.c
index b81c33202f41a..295742e839602 100644
--- a/drivers/thermal/intel/x86_pkg_temp_thermal.c
+++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c
@@ -17,8 +17,10 @@
#include <linux/pm.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
+
#include <asm/cpu_device_id.h>
-#include <asm/mce.h>
+
+#include "thermal_interrupt.h"
/*
* Rate control delay: Idea is to introduce denounce effect
diff --git a/fs/exec.c b/fs/exec.c
index 48d1e8b1610bc..6f3c02066ce32 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -708,7 +708,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
return -ENOMEM;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, old_start, old_end);
+ tlb_gather_mmu(&tlb, mm);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
@@ -725,7 +725,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
}
- tlb_finish_mmu(&tlb, old_start, old_end);
+ tlb_finish_mmu(&tlb);
/*
* Shrink the vma to just the new range. Always succeeds.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 602e3a52884d8..3cec6fbef725e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1210,7 +1210,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
- struct mmu_gather tlb;
int itype;
int rv;
@@ -1249,7 +1248,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
goto out_unlock;
}
- tlb_gather_mmu(&tlb, mm, 0, -1);
if (type == CLEAR_REFS_SOFT_DIRTY) {
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (!(vma->vm_flags & VM_SOFTDIRTY))
@@ -1258,15 +1256,18 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
vma_set_page_prot(vma);
}
+ inc_tlb_flush_pending(mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
&cp);
- if (type == CLEAR_REFS_SOFT_DIRTY)
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, 0, -1);
+ flush_tlb_mm(mm);
+ dec_tlb_flush_pending(mm);
+ }
out_unlock:
mmap_write_unlock(mm);
out_mm:
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 7282c0f50c856..302506bbc2a4f 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -50,6 +50,7 @@ mandatory-y += sections.h
mandatory-y += serial.h
mandatory-y += shmparam.h
mandatory-y += simd.h
+mandatory-y += softirq_stack.h
mandatory-y += switch_to.h
mandatory-y += timex.h
mandatory-y += tlbflush.h
diff --git a/include/asm-generic/softirq_stack.h b/include/asm-generic/softirq_stack.h
new file mode 100644
index 0000000000000..eceeecf6a5bd8
--- /dev/null
+++ b/include/asm-generic/softirq_stack.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_GENERIC_SOFTIRQ_STACK_H
+#define __ASM_GENERIC_SOFTIRQ_STACK_H
+
+#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK
+void do_softirq_own_stack(void);
+#else
+static inline void do_softirq_own_stack(void)
+{
+ __do_softirq();
+}
+#endif
+
+#endif
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 6661ee1cff479..2c68a545ffa7d 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -46,7 +46,9 @@
*
* The mmu_gather API consists of:
*
- * - tlb_gather_mmu() / tlb_finish_mmu(); start and finish a mmu_gather
+ * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu()
+ *
+ * start and finish a mmu_gather
*
* Finish in particular will issue a (final) TLB invalidate and free
* all (remaining) queued pages.
@@ -91,7 +93,7 @@
*
* - mmu_gather::fullmm
*
- * A flag set by tlb_gather_mmu() to indicate we're going to free
+ * A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free
* the entire mm; this allows a number of optimizations.
*
* - We can ignore tlb_{start,end}_vma(); because we don't
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 02ca2c755dfd9..4085133ec2e61 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -403,7 +403,10 @@
. = ALIGN(8); \
__start_static_call_sites = .; \
KEEP(*(.static_call_sites)) \
- __stop_static_call_sites = .;
+ __stop_static_call_sites = .; \
+ __start_static_call_tramp_key = .; \
+ KEEP(*(.static_call_tramp_key)) \
+ __stop_static_call_tramp_key = .;
/*
* Allow architectures to handle ro_after_init data on their
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 451c2d26a5db8..4f2f79de083e2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -307,7 +307,7 @@ void css_task_iter_end(struct css_task_iter *it);
* Inline functions.
*/
-static inline u64 cgroup_id(struct cgroup *cgrp)
+static inline u64 cgroup_id(const struct cgroup *cgrp)
{
return cgrp->kn->id;
}
@@ -701,7 +701,7 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
struct cgroup_subsys_state;
struct cgroup;
-static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; }
+static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b8fe0c23cfffb..df5b405e63051 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -76,6 +76,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
#else
# define likely(x) __builtin_expect(!!(x), 1)
# define unlikely(x) __builtin_expect(!!(x), 0)
+# define likely_notrace(x) likely(x)
+# define unlikely_notrace(x) unlikely(x)
#endif
/* Optimization barrier */
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d6428aaf67e73..3aaa0687e8df6 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -111,6 +111,8 @@ static inline void cpu_maps_update_done(void)
#endif /* CONFIG_SMP */
extern struct bus_type cpu_subsys;
+extern int lockdep_is_cpus_held(void);
+
#ifdef CONFIG_HOTPLUG_CPU
extern void cpus_write_lock(void);
extern void cpus_write_unlock(void);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 763b816ba19ca..8710f5710c1d1 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -29,10 +29,10 @@
#include <asm/page.h>
#define EFI_SUCCESS 0
-#define EFI_LOAD_ERROR ( 1 | (1UL << (BITS_PER_LONG-1)))
+#define EFI_LOAD_ERROR ( 1 | (1UL << (BITS_PER_LONG-1)))
#define EFI_INVALID_PARAMETER ( 2 | (1UL << (BITS_PER_LONG-1)))
#define EFI_UNSUPPORTED ( 3 | (1UL << (BITS_PER_LONG-1)))
-#define EFI_BAD_BUFFER_SIZE ( 4 | (1UL << (BITS_PER_LONG-1)))
+#define EFI_BAD_BUFFER_SIZE ( 4 | (1UL << (BITS_PER_LONG-1)))
#define EFI_BUFFER_TOO_SMALL ( 5 | (1UL << (BITS_PER_LONG-1)))
#define EFI_NOT_READY ( 6 | (1UL << (BITS_PER_LONG-1)))
#define EFI_DEVICE_ERROR ( 7 | (1UL << (BITS_PER_LONG-1)))
@@ -167,8 +167,6 @@ struct capsule_info {
int __efi_capsule_setup_info(struct capsule_info *cap_info);
-typedef int (*efi_freemem_callback_t) (u64 start, u64 end, void *arg);
-
/*
* Types and defines for Time Services
*/
@@ -605,10 +603,6 @@ efi_guid_to_str(efi_guid_t *guid, char *out)
}
extern void efi_init (void);
-extern void *efi_get_pal_addr (void);
-extern void efi_map_pal_code (void);
-extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg);
-extern void efi_gettimeofday (struct timespec64 *ts);
#ifdef CONFIG_EFI
extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if possible */
#else
@@ -1110,13 +1104,6 @@ enum efi_secureboot_mode efi_get_secureboot_mode(efi_get_variable_t *get_var)
return efi_secureboot_mode_enabled;
}
-#ifdef CONFIG_RESET_ATTACK_MITIGATION
-void efi_enable_reset_attack_mitigation(void);
-#else
-static inline void
-efi_enable_reset_attack_mitigation(void) { }
-#endif
-
#ifdef CONFIG_EFI_EMBEDDED_FIRMWARE
void efi_check_for_embedded_firmwares(void);
#else
@@ -1125,8 +1112,6 @@ static inline void efi_check_for_embedded_firmwares(void) { }
efi_status_t efi_random_get_seed(void);
-void efi_retrieve_tpm2_eventlog(void);
-
/*
* Arch code can implement the following three template macros, avoiding
* reptition for the void/non-void return cases of {__,}efi_call_virt():
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index a104b298019ae..883acef895bc4 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -2,6 +2,7 @@
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H
+#include <linux/static_call_types.h>
#include <linux/tracehook.h>
#include <linux/syscalls.h>
#include <linux/seccomp.h>
@@ -454,6 +455,9 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
* Conditional reschedule with additional sanity checks.
*/
void irqentry_exit_cond_resched(void);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DECLARE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#endif
/**
* irqentry_exit - Handle return from exception that used irqentry_enter()
diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 9b93f8584ff7d..8b2b1d68b9545 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -47,6 +47,20 @@ static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu,
int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
/**
+ * xfer_to_guest_mode_prepare - Perform last minute preparation work that
+ * needs to be handled while IRQs are disabled
+ * upon entering the guest.
+ *
+ * Has to be invoked with interrupts disabled before the last call
+ * to xfer_to_guest_mode_work_pending().
+ */
+static inline void xfer_to_guest_mode_prepare(void)
+{
+ lockdep_assert_irqs_disabled();
+ rcu_nocb_flush_deferred_wakeup();
+}
+
+/**
* __xfer_to_guest_mode_work_pending - Check if work is pending
*
* Returns: True if work pending, False otherwise.
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index bb8ff9083e7db..967e257671534 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -569,15 +569,6 @@ struct softirq_action
asmlinkage void do_softirq(void);
asmlinkage void __do_softirq(void);
-#ifdef __ARCH_HAS_DO_SOFTIRQ
-void do_softirq_own_stack(void);
-#else
-static inline void do_softirq_own_stack(void)
-{
- __do_softirq();
-}
-#endif
-
extern void open_softirq(int nr, void (*action)(struct softirq_action *));
extern void softirq_init(void);
extern void __raise_softirq_irqoff(unsigned int nr);
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 8de0e1373de70..600c10da321a7 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -149,6 +149,17 @@ do { \
# define start_critical_timings() do { } while (0)
#endif
+#ifdef CONFIG_DEBUG_IRQFLAGS
+extern void warn_bogus_irq_restore(void);
+#define raw_check_bogus_irq_restore() \
+ do { \
+ if (unlikely(!arch_irqs_disabled())) \
+ warn_bogus_irq_restore(); \
+ } while (0)
+#else
+#define raw_check_bogus_irq_restore() do { } while (0)
+#endif
+
/*
* Wrap the arch provided IRQ routines to provide appropriate checks.
*/
@@ -162,6 +173,7 @@ do { \
#define raw_local_irq_restore(flags) \
do { \
typecheck(unsigned long, flags); \
+ raw_check_bogus_irq_restore(); \
arch_local_irq_restore(flags); \
} while (0)
#define raw_local_save_flags(flags) \
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 32809624d422e..d92691262f51a 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -261,14 +261,14 @@ static __always_inline void jump_label_init(void)
static __always_inline bool static_key_false(struct static_key *key)
{
- if (unlikely(static_key_count(key) > 0))
+ if (unlikely_notrace(static_key_count(key) > 0))
return true;
return false;
}
static __always_inline bool static_key_true(struct static_key *key)
{
- if (likely(static_key_count(key) > 0))
+ if (likely_notrace(static_key_count(key) > 0))
return true;
return false;
}
@@ -460,7 +460,7 @@ extern bool ____wrong_branch_error(void);
branch = !arch_static_branch_jump(&(x)->key, true); \
else \
branch = ____wrong_branch_error(); \
- likely(branch); \
+ likely_notrace(branch); \
})
#define static_branch_unlikely(x) \
@@ -472,13 +472,13 @@ extern bool ____wrong_branch_error(void);
branch = arch_static_branch(&(x)->key, false); \
else \
branch = ____wrong_branch_error(); \
- unlikely(branch); \
+ unlikely_notrace(branch); \
})
#else /* !CONFIG_JUMP_LABEL */
-#define static_branch_likely(x) likely(static_key_enabled(&(x)->key))
-#define static_branch_unlikely(x) unlikely(static_key_enabled(&(x)->key))
+#define static_branch_likely(x) likely_notrace(static_key_enabled(&(x)->key))
+#define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key))
#endif /* CONFIG_JUMP_LABEL */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f7902d8c10481..5b7ed6dc99acc 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -15,7 +15,7 @@
#include <linux/typecheck.h>
#include <linux/printk.h>
#include <linux/build_bug.h>
-
+#include <linux/static_call_types.h>
#include <asm/byteorder.h>
#include <uapi/linux/kernel.h>
@@ -81,11 +81,26 @@ struct pt_regs;
struct user;
#ifdef CONFIG_PREEMPT_VOLUNTARY
-extern int _cond_resched(void);
-# define might_resched() _cond_resched()
+
+extern int __cond_resched(void);
+# define might_resched() __cond_resched()
+
+#elif defined(CONFIG_PREEMPT_DYNAMIC)
+
+extern int __cond_resched(void);
+
+DECLARE_STATIC_CALL(might_resched, __cond_resched);
+
+static __always_inline void might_resched(void)
+{
+ static_call_mod(might_resched)();
+}
+
#else
+
# define might_resched() do { } while (0)
-#endif
+
+#endif /* CONFIG_PREEMPT_* */
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
extern void ___might_sleep(const char *file, int line, int preempt_offset);
diff --git a/include/linux/list.h b/include/linux/list.h
index 89bdc92e75c33..f2af4b4aa4e9a 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -901,7 +901,7 @@ static inline void hlist_add_before(struct hlist_node *n,
}
/**
- * hlist_add_behing - add a new entry after the one specified
+ * hlist_add_behind - add a new entry after the one specified
* @n: new entry to be added
* @prev: hlist node to add it after, which must be non-NULL
*/
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index 4a8795b21d774..ded90b097e6e8 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -18,6 +18,7 @@ typedef struct {
.dep_map = { \
.name = #lockname, \
.wait_type_inner = LD_WAIT_CONFIG, \
+ .lock_type = LD_LOCK_PERCPU, \
}
#else
# define LL_DEP_MAP_INIT(lockname)
@@ -30,7 +31,9 @@ do { \
static struct lock_class_key __key; \
\
debug_check_no_locks_freed((void *)lock, sizeof(*lock));\
- lockdep_init_map_wait(&(lock)->dep_map, #lock, &__key, 0, LD_WAIT_CONFIG);\
+ lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, 0, \
+ LD_WAIT_CONFIG, LD_WAIT_INV, \
+ LD_LOCK_PERCPU); \
} while (0)
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index b9e9adec73e8b..7b7ebf2e28ec5 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -185,12 +185,23 @@ extern void lockdep_unregister_key(struct lock_class_key *key);
* to lockdep:
*/
-extern void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
- struct lock_class_key *key, int subclass, short inner, short outer);
+extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type);
+
+/*
+ * Forward to lockdep_init_map_type() with the caller's @inner and @outer
+ * wait types and a lock type of LD_LOCK_NORMAL.
+ */
+static inline void
+lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass, u8 inner, u8 outer)
+{
+ lockdep_init_map_type(lock, name, key, subclass, inner, outer, LD_LOCK_NORMAL);
+}
static inline void
lockdep_init_map_wait(struct lockdep_map *lock, const char *name,
- struct lock_class_key *key, int subclass, short inner)
+ struct lock_class_key *key, int subclass, u8 inner)
{
lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV);
}
@@ -340,6 +347,8 @@ static inline void lockdep_set_selftest_task(struct task_struct *task)
# define lock_set_class(l, n, k, s, i) do { } while (0)
# define lock_set_subclass(l, s, i) do { } while (0)
# define lockdep_init() do { } while (0)
+# define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \
+ do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \
do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_wait(lock, name, key, sub, inner) \
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 9a1fd49df17f6..2ec9ff5a7fff0 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -30,6 +30,12 @@ enum lockdep_wait_type {
LD_WAIT_MAX, /* must be last */
};
+enum lockdep_lock_type {
+ LD_LOCK_NORMAL = 0, /* normal, catch all */
+ LD_LOCK_PERCPU, /* percpu */
+ LD_LOCK_MAX,
+};
+
#ifdef CONFIG_LOCKDEP
/*
@@ -119,8 +125,10 @@ struct lock_class {
int name_version;
const char *name;
- short wait_type_inner;
- short wait_type_outer;
+ u8 wait_type_inner;
+ u8 wait_type_outer;
+ u8 lock_type;
+ /* u8 hole; */
#ifdef CONFIG_LOCK_STAT
unsigned long contention_point[LOCKSTAT_POINTS];
@@ -169,8 +177,10 @@ struct lockdep_map {
struct lock_class_key *key;
struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES];
const char *name;
- short wait_type_outer; /* can be taken in this context */
- short wait_type_inner; /* presents this context */
+ u8 wait_type_outer; /* can be taken in this context */
+ u8 wait_type_inner; /* presents this context */
+ u8 lock_type;
+ /* u8 hole; */
#ifdef CONFIG_LOCK_STAT
int cpu;
unsigned long ip;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7672ce12571b1..ab959488bc0f5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3182,5 +3182,7 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
extern int sysctl_nr_trim_pages;
+void mem_dump_obj(void *object);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 07d9acb5b19c4..0974ad501a47c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -588,10 +588,9 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
}
struct mmu_gather;
-extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end);
-extern void tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end);
+extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
+extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
+extern void tlb_finish_mmu(struct mmu_gather *tlb);
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index dcd185cbfe793..0cd631a197276 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -199,29 +199,4 @@ extern void mutex_unlock(struct mutex *lock);
extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
-/*
- * These values are chosen such that FAIL and SUCCESS match the
- * values of the regular mutex_trylock().
- */
-enum mutex_trylock_recursive_enum {
- MUTEX_TRYLOCK_FAILED = 0,
- MUTEX_TRYLOCK_SUCCESS = 1,
- MUTEX_TRYLOCK_RECURSIVE,
-};
-
-/**
- * mutex_trylock_recursive - trylock variant that allows recursive locking
- * @lock: mutex to be locked
- *
- * This function should not be used, _ever_. It is purely for hysterical GEM
- * raisins, and once those are gone this will be removed.
- *
- * Returns:
- * - MUTEX_TRYLOCK_FAILED - trylock failed,
- * - MUTEX_TRYLOCK_SUCCESS - lock acquired,
- * - MUTEX_TRYLOCK_RECURSIVE - we already owned the lock.
- */
-extern /* __deprecated */ __must_check enum mutex_trylock_recursive_enum
-mutex_trylock_recursive(struct mutex *lock);
-
#endif /* __LINUX_MUTEX_H */
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 577f51436cf92..7e72d975cb761 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -29,11 +29,14 @@ struct unwind_hint {
*
* UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
* sp_reg+sp_offset points to the iret return frame.
+ *
+ * UNWIND_HINT_TYPE_FUNC: Generate the unwind metadata of a callable function.
+ * Useful for code which doesn't have an ELF function annotation.
*/
#define UNWIND_HINT_TYPE_CALL 0
#define UNWIND_HINT_TYPE_REGS 1
#define UNWIND_HINT_TYPE_REGS_PARTIAL 2
-#define UNWIND_HINT_TYPE_RET_OFFSET 3
+#define UNWIND_HINT_TYPE_FUNC 3
#ifdef CONFIG_STACK_VALIDATION
@@ -109,6 +112,12 @@ struct unwind_hint {
.popsection
.endm
+.macro STACK_FRAME_NON_STANDARD func:req
+ .pushsection .discard.func_stack_frame_non_standard, "aw"
+ .long \func - .
+ .popsection
+.endm
+
#endif /* __ASSEMBLY__ */
#else /* !CONFIG_STACK_VALIDATION */
@@ -122,6 +131,8 @@ struct unwind_hint {
#define ANNOTATE_INTRA_FUNCTION_CALL
.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
.endm
+.macro STACK_FRAME_NON_STANDARD func:req
+.endm
#endif
#endif /* CONFIG_STACK_VALIDATION */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9a38f579bc764..fab42cfbd3502 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -998,7 +998,7 @@ struct perf_sample_data {
struct perf_raw_record *raw;
struct perf_branch_stack *br_stack;
u64 period;
- u64 weight;
+ union perf_sample_weight weight;
u64 txn;
union perf_mem_data_src data_src;
@@ -1047,7 +1047,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
data->raw = NULL;
data->br_stack = NULL;
data->period = period;
- data->weight = 0;
+ data->weight.full = 0;
data->data_src.val = PERF_MEM_NA;
data->txn = 0;
}
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index d7db179963221..d31ecaf4fdd3d 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -141,12 +141,18 @@ static inline void rb_insert_color_cached(struct rb_node *node,
rb_insert_color(node, &root->rb_root);
}
-static inline void rb_erase_cached(struct rb_node *node,
- struct rb_root_cached *root)
+
+static inline struct rb_node *
+rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
+ struct rb_node *leftmost = NULL;
+
if (root->rb_leftmost == node)
- root->rb_leftmost = rb_next(node);
+ leftmost = root->rb_leftmost = rb_next(node);
+
rb_erase(node, &root->rb_root);
+
+ return leftmost;
}
static inline void rb_replace_node_cached(struct rb_node *victim,
@@ -158,4 +164,198 @@ static inline void rb_replace_node_cached(struct rb_node *victim,
rb_replace_node(victim, new, &root->rb_root);
}
+/*
+ * The below helper functions use 2 operators with 3 different
+ * calling conventions. The operators are related like:
+ *
+ * comp(a->key,b) < 0 := less(a,b)
+ * comp(a->key,b) > 0 := less(b,a)
+ * comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
+ *
+ * If these operators define a partial order on the elements we make no
+ * guarantee on which of the elements matching the key is found. See
+ * rb_find().
+ *
+ * The reason for this is to allow the find() interface without requiring an
+ * on-stack dummy object, which might not be feasible due to object size.
+ */
+
+/**
+ * rb_add_cached() - insert @node into the leftmost cached tree @tree
+ * @node: node to insert
+ * @tree: leftmost cached tree to insert @node into
+ * @less: operator defining the (partial) node order
+ *
+ * Returns @node when it is the new leftmost, or NULL.
+ */
+static __always_inline struct rb_node *
+rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
+ bool (*less)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node **link = &tree->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ while (*link) {
+ parent = *link;
+ if (less(node, parent)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(node, parent, link);
+ rb_insert_color_cached(node, tree, leftmost);
+
+ return leftmost ? node : NULL;
+}
+
+/**
+ * rb_add() - insert @node into @tree
+ * @node: node to insert
+ * @tree: tree to insert @node into
+ * @less: operator defining the (partial) node order
+ */
+static __always_inline void
+rb_add(struct rb_node *node, struct rb_root *tree,
+ bool (*less)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node **link = &tree->rb_node;
+ struct rb_node *parent = NULL;
+
+ while (*link) {
+ parent = *link;
+ if (less(node, parent))
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+
+ rb_link_node(node, parent, link);
+ rb_insert_color(node, tree);
+}
+
+/**
+ * rb_find_add() - find equivalent @node in @tree, or add @node
+ * @node: node to look-for / insert
+ * @tree: tree to search / modify
+ * @cmp: operator defining the node order
+ *
+ * Returns the rb_node matching @node, or NULL when no match is found and @node
+ * is inserted.
+ */
+static __always_inline struct rb_node *
+rb_find_add(struct rb_node *node, struct rb_root *tree,
+ int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node **link = &tree->rb_node;
+ struct rb_node *parent = NULL;
+ int c;
+
+ while (*link) {
+ parent = *link;
+ c = cmp(node, parent);
+
+ if (c < 0)
+ link = &parent->rb_left;
+ else if (c > 0)
+ link = &parent->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, link);
+ rb_insert_color(node, tree);
+ return NULL;
+}
+
+/**
+ * rb_find() - find @key in tree @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining the node order
+ *
+ * Returns the rb_node matching @key or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find(const void *key, const struct rb_root *tree,
+ int (*cmp)(const void *key, const struct rb_node *))
+{
+ struct rb_node *node = tree->rb_node;
+
+ while (node) {
+ int c = cmp(key, node);
+
+ if (c < 0)
+ node = node->rb_left;
+ else if (c > 0)
+ node = node->rb_right;
+ else
+ return node;
+ }
+
+ return NULL;
+}
+
+/**
+ * rb_find_first() - find the first @key in @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining node order
+ *
+ * Returns the leftmost node matching @key, or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find_first(const void *key, const struct rb_root *tree,
+ int (*cmp)(const void *key, const struct rb_node *))
+{
+ struct rb_node *node = tree->rb_node;
+ struct rb_node *match = NULL;
+
+ while (node) {
+ int c = cmp(key, node);
+
+ if (c <= 0) {
+ if (!c)
+ match = node;
+ node = node->rb_left;
+ } else if (c > 0) {
+ node = node->rb_right;
+ }
+ }
+
+ return match;
+}
+
+/**
+ * rb_next_match() - find the next @key in @tree
+ * @key: key to match
+ * @node: current match; the candidate examined is rb_next(@node)
+ * @cmp: operator defining node order
+ *
+ * Returns the next node matching @key, or NULL.
+ */
+static __always_inline struct rb_node *
+rb_next_match(const void *key, struct rb_node *node,
+ int (*cmp)(const void *key, const struct rb_node *))
+{
+ node = rb_next(node);
+ if (node && cmp(key, node))
+ node = NULL;
+ return node;
+}
+
+/**
+ * rb_for_each() - iterates a subtree matching @key
+ * @node: iterator
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining node order
+ */
+#define rb_for_each(node, key, tree, cmp) \
+ for ((node) = rb_find_first((key), (tree), (cmp)); \
+ (node); (node) = rb_next_match((key), (node), (cmp)))
+
#endif /* _LINUX_RBTREE_H */
diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index b36afe7b22c9a..8afe886e85f10 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -63,6 +63,122 @@ struct rcu_cblist {
#define RCU_NEXT_TAIL 3
#define RCU_CBLIST_NSEGS 4
+
+/*
+ * ==NOCB Offloading state machine==
+ *
+ *
+ * ----------------------------------------------------------------------------
+ * | SEGCBLIST_SOFTIRQ_ONLY |
+ * | |
+ * | Callbacks processed by rcu_core() from softirqs or local |
+ * | rcuc kthread, without holding nocb_lock. |
+ * ----------------------------------------------------------------------------
+ * |
+ * v
+ * ----------------------------------------------------------------------------
+ * | SEGCBLIST_OFFLOADED |
+ * | |
+ * | Callbacks processed by rcu_core() from softirqs or local |
+ * | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads, |
+ * | allowing nocb_timer to be armed. |
+ * ----------------------------------------------------------------------------
+ * |
+ * v
+ * -----------------------------------
+ * | |
+ * v v
+ * --------------------------------------- ----------------------------------|
+ * | SEGCBLIST_OFFLOADED | | | SEGCBLIST_OFFLOADED | |
+ * | SEGCBLIST_KTHREAD_CB | | SEGCBLIST_KTHREAD_GP |
+ * | | | |
+ * | | | |
+ * | CB kthread woke up and | | GP kthread woke up and |
+ * | acknowledged SEGCBLIST_OFFLOADED. | | acknowledged SEGCBLIST_OFFLOADED|
+ * | Processes callbacks concurrently | | |
+ * | with rcu_core(), holding | | |
+ * | nocb_lock. | | |
+ * --------------------------------------- -----------------------------------
+ * | |
+ * -----------------------------------
+ * |
+ * v
+ * |--------------------------------------------------------------------------|
+ * | SEGCBLIST_OFFLOADED | |
+ * | SEGCBLIST_KTHREAD_CB | |
+ * | SEGCBLIST_KTHREAD_GP |
+ * | |
+ * | Kthreads handle callbacks holding nocb_lock, local rcu_core() stops |
+ * | handling callbacks. |
+ * ----------------------------------------------------------------------------
+ */
+
+
+
+/*
+ * ==NOCB De-Offloading state machine==
+ *
+ *
+ * |--------------------------------------------------------------------------|
+ * | SEGCBLIST_OFFLOADED | |
+ * | SEGCBLIST_KTHREAD_CB | |
+ * | SEGCBLIST_KTHREAD_GP |
+ * | |
+ * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() |
+ * | ignores callbacks. |
+ * ----------------------------------------------------------------------------
+ * |
+ * v
+ * |--------------------------------------------------------------------------|
+ * | SEGCBLIST_KTHREAD_CB | |
+ * | SEGCBLIST_KTHREAD_GP |
+ * | |
+ * | CB/GP kthreads and local rcu_core() handle callbacks concurrently |
+ * | holding nocb_lock. Wake up CB and GP kthreads if necessary. |
+ * ----------------------------------------------------------------------------
+ * |
+ * v
+ * -----------------------------------
+ * | |
+ * v v
+ * ---------------------------------------------------------------------------|
+ * | |
+ * | SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP |
+ * | | |
+ * | GP kthread woke up and | CB kthread woke up and |
+ * | acknowledged the fact that | acknowledged the fact that |
+ * | SEGCBLIST_OFFLOADED got cleared. | SEGCBLIST_OFFLOADED got cleared. |
+ * | | The CB kthread goes to sleep |
+ * | The callbacks from the target CPU | until it ever gets re-offloaded. |
+ * | will be ignored from the GP kthread | |
+ * | loop. | |
+ * ----------------------------------------------------------------------------
+ * | |
+ * -----------------------------------
+ * |
+ * v
+ * ----------------------------------------------------------------------------
+ * | 0 |
+ * | |
+ * | Callbacks processed by rcu_core() from softirqs or local |
+ * | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed. |
+ * | Flush pending nocb_timer. Flush nocb bypass callbacks. |
+ * ----------------------------------------------------------------------------
+ * |
+ * v
+ * ----------------------------------------------------------------------------
+ * | SEGCBLIST_SOFTIRQ_ONLY |
+ * | |
+ * | Callbacks processed by rcu_core() from softirqs or local |
+ * | rcuc kthread, without holding nocb_lock. |
+ * ----------------------------------------------------------------------------
+ */
+#define SEGCBLIST_ENABLED BIT(0)
+#define SEGCBLIST_SOFTIRQ_ONLY BIT(1)
+#define SEGCBLIST_KTHREAD_CB BIT(2)
+#define SEGCBLIST_KTHREAD_GP BIT(3)
+#define SEGCBLIST_OFFLOADED BIT(4)
+
struct rcu_segcblist {
struct rcu_head *head;
struct rcu_head **tails[RCU_CBLIST_NSEGS];
@@ -72,8 +188,8 @@ struct rcu_segcblist {
#else
long len;
#endif
- u8 enabled;
- u8 offloaded;
+ long seglen[RCU_CBLIST_NSEGS];
+ u8 flags;
};
#define RCU_SEGCBLIST_INITIALIZER(n) \
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index fd02c5fa60cb1..bd04f722714f6 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -33,6 +33,8 @@
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
#define ulong2long(a) (*(long *)(&(a)))
+#define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
+#define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b)))
/* Exported common interfaces */
void call_rcu(struct rcu_head *head, rcu_callback_t func);
@@ -110,8 +112,14 @@ static inline void rcu_user_exit(void) { }
#ifdef CONFIG_RCU_NOCB_CPU
void rcu_init_nohz(void);
+int rcu_nocb_cpu_offload(int cpu);
+int rcu_nocb_cpu_deoffload(int cpu);
+void rcu_nocb_flush_deferred_wakeup(void);
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static inline void rcu_init_nohz(void) { }
+static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
+static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
+static inline void rcu_nocb_flush_deferred_wakeup(void) { }
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
/**
@@ -846,19 +854,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
*/
#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
-/*
- * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
- */
-#define __kvfree_rcu(head, offset) \
- do { \
- BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \
- kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
- } while (0)
-
/**
* kfree_rcu() - kfree an object after a grace period.
- * @ptr: pointer to kfree
- * @rhf: the name of the struct rcu_head within the type of @ptr.
+ * @ptr: pointer to kfree for both single- and double-argument invocations.
+ * @rhf: the name of the struct rcu_head within the type of @ptr,
+ * but only for double-argument invocations.
*
* Many rcu callbacks functions just call kfree() on the base structure.
* These functions are trivial, but their size adds up, and furthermore
@@ -871,7 +871,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
* Because the functions are not allowed in the low-order 4096 bytes of
* kernel virtual memory, offsets up to 4095 bytes can be accommodated.
* If the offset is larger than 4095 bytes, a compile-time error will
- * be generated in __kvfree_rcu(). If this error is triggered, you can
+ * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
* either fall back to use of call_rcu() or rearrange the structure to
* position the rcu_head structure into the first 4096 bytes.
*
@@ -881,13 +881,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
* The BUILD_BUG_ON check must not involve any function calls, hence the
* checks are done in macros here.
*/
-#define kfree_rcu(ptr, rhf) \
-do { \
- typeof (ptr) ___p = (ptr); \
- \
- if (___p) \
- __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
-} while (0)
+#define kfree_rcu kvfree_rcu
/**
* kvfree_rcu() - kvfree an object after a grace period.
@@ -919,7 +913,17 @@ do { \
kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)
#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
-#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf)
+#define kvfree_rcu_arg_2(ptr, rhf) \
+do { \
+ typeof (ptr) ___p = (ptr); \
+ \
+ if (___p) { \
+ BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \
+ kvfree_call_rcu(&((___p)->rhf), (rcu_callback_t)(unsigned long) \
+ (offsetof(typeof(*(ptr)), rhf))); \
+ } \
+} while (0)
+
#define kvfree_rcu_arg_1(ptr) \
do { \
typeof(ptr) ___p = (ptr); \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e3a5eeec509a..4d568288abf9f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -47,6 +47,7 @@ struct cfs_rq;
struct fs_struct;
struct futex_pi_state;
struct io_context;
+struct io_uring_task;
struct mempolicy;
struct nameidata;
struct nsproxy;
@@ -65,7 +66,6 @@ struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
-struct io_uring_task;
/*
* Task state bitmask. NOTE! These bits are also
@@ -1871,11 +1871,32 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
* value indicates whether a reschedule was done in fact.
* cond_resched_lock() will drop the spinlock before scheduling,
*/
-#ifndef CONFIG_PREEMPTION
-extern int _cond_resched(void);
+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+extern int __cond_resched(void);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+DECLARE_STATIC_CALL(cond_resched, __cond_resched);
+
+static __always_inline int _cond_resched(void)
+{
+ return static_call_mod(cond_resched)();
+}
+
+#else
+
+static inline int _cond_resched(void)
+{
+ return __cond_resched();
+}
+
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
#else
+
static inline int _cond_resched(void) { return 0; }
-#endif
+
+#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
#define cond_resched() ({ \
___might_sleep(__FILE__, __LINE__, 0); \
@@ -1968,6 +1989,11 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
#define TASK_SIZE_OF(tsk) TASK_SIZE
#endif
+#ifdef CONFIG_SMP
+/* Returns effective CPU energy utilization, as seen by the scheduler */
+unsigned long sched_cpu_util(int cpu, unsigned long max);
+#endif /* CONFIG_SMP */
+
#ifdef CONFIG_RSEQ
/*
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index 7d64feafc408e..ab83d85e1183a 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -11,16 +11,9 @@
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space. This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
-#define MAX_USER_RT_PRIO 100
-#define MAX_RT_PRIO MAX_USER_RT_PRIO
+#define MAX_RT_PRIO 100
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
@@ -34,15 +27,6 @@
#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO)
/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
-
-/*
* Convert nice value [19,-20] to rlimit style value [1,40].
*/
static inline long nice_to_rlimit(long nice)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index be4ba5867ac5f..7ae6040767670 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -186,6 +186,8 @@ void kfree(const void *);
void kfree_sensitive(const void *);
size_t __ksize(const void *);
size_t ksize(const void *);
+bool kmem_valid_obj(void *object);
+void kmem_dump_obj(void *object);
#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index e432cc92c73de..a0895bbf71ce0 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -60,6 +60,9 @@ void cleanup_srcu_struct(struct srcu_struct *ssp);
int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
void synchronize_srcu(struct srcu_struct *ssp);
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 5a5a1941ca156..0e0cf4d6a72a0 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -15,7 +15,8 @@
struct srcu_struct {
short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */
- short srcu_idx; /* Current reader array element. */
+ unsigned short srcu_idx; /* Current reader array element in bit 0x2. */
+ unsigned short srcu_idx_max; /* Furthest future srcu_idx request. */
u8 srcu_gp_running; /* GP workqueue running? */
u8 srcu_gp_waiting; /* GP waiting for readers? */
struct swait_queue_head srcu_wq;
@@ -59,7 +60,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp)
{
int idx;
- idx = READ_ONCE(ssp->srcu_idx);
+ idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1);
return idx;
}
@@ -80,7 +81,7 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp,
{
int idx;
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
+ idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
tt, tf, idx,
READ_ONCE(ssp->srcu_lock_nesting[!idx]),
diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 695da4c9b3381..85ecc789f4ffd 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -107,26 +107,10 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool
#define STATIC_CALL_TRAMP_ADDR(name) &STATIC_CALL_TRAMP(name)
-/*
- * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from
- * the symbol table so that objtool can reference it when it generates the
- * .static_call_sites section.
- */
-#define __static_call(name) \
-({ \
- __ADDRESSABLE(STATIC_CALL_KEY(name)); \
- &STATIC_CALL_TRAMP(name); \
-})
-
#else
#define STATIC_CALL_TRAMP_ADDR(name) NULL
#endif
-
-#define DECLARE_STATIC_CALL(name, func) \
- extern struct static_call_key STATIC_CALL_KEY(name); \
- extern typeof(func) STATIC_CALL_TRAMP(name);
-
#define static_call_update(name, func) \
({ \
BUILD_BUG_ON(!__same_type(*(func), STATIC_CALL_TRAMP(name))); \
@@ -154,17 +138,25 @@ struct static_call_key {
};
};
+/* For finding the key associated with a trampoline */
+struct static_call_tramp_key {
+ s32 tramp;
+ s32 key;
+};
+
extern void __static_call_update(struct static_call_key *key, void *tramp, void *func);
extern int static_call_mod_init(struct module *mod);
extern int static_call_text_reserved(void *start, void *end);
-#define DEFINE_STATIC_CALL(name, _func) \
+extern long __static_call_return0(void);
+
+#define __DEFINE_STATIC_CALL(name, _func, _func_init) \
DECLARE_STATIC_CALL(name, _func); \
struct static_call_key STATIC_CALL_KEY(name) = { \
- .func = _func, \
+ .func = _func_init, \
.type = 1, \
}; \
- ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
+ ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func_init)
#define DEFINE_STATIC_CALL_NULL(name, _func) \
DECLARE_STATIC_CALL(name, _func); \
@@ -174,17 +166,23 @@ extern int static_call_text_reserved(void *start, void *end);
}; \
ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
-#define static_call(name) __static_call(name)
#define static_call_cond(name) (void)__static_call(name)
#define EXPORT_STATIC_CALL(name) \
EXPORT_SYMBOL(STATIC_CALL_KEY(name)); \
EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
-
#define EXPORT_STATIC_CALL_GPL(name) \
EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)); \
EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
+/* Leave the key unexported, so modules can't change static call targets: */
+#define EXPORT_STATIC_CALL_TRAMP(name) \
+ EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)); \
+ ARCH_ADD_TRAMP_KEY(name)
+#define EXPORT_STATIC_CALL_TRAMP_GPL(name) \
+ EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)); \
+ ARCH_ADD_TRAMP_KEY(name)
+
#elif defined(CONFIG_HAVE_STATIC_CALL)
static inline int static_call_init(void) { return 0; }
@@ -193,12 +191,12 @@ struct static_call_key {
void *func;
};
-#define DEFINE_STATIC_CALL(name, _func) \
+#define __DEFINE_STATIC_CALL(name, _func, _func_init) \
DECLARE_STATIC_CALL(name, _func); \
struct static_call_key STATIC_CALL_KEY(name) = { \
- .func = _func, \
+ .func = _func_init, \
}; \
- ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
+ ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func_init)
#define DEFINE_STATIC_CALL_NULL(name, _func) \
DECLARE_STATIC_CALL(name, _func); \
@@ -207,7 +205,6 @@ struct static_call_key {
}; \
ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
-#define static_call(name) __static_call(name)
#define static_call_cond(name) (void)__static_call(name)
static inline
@@ -224,14 +221,24 @@ static inline int static_call_text_reserved(void *start, void *end)
return 0;
}
+static inline long __static_call_return0(void)
+{
+ return 0;
+}
+
#define EXPORT_STATIC_CALL(name) \
EXPORT_SYMBOL(STATIC_CALL_KEY(name)); \
EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
-
#define EXPORT_STATIC_CALL_GPL(name) \
EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)); \
EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
+/* Leave the key unexported, so modules can't change static call targets: */
+#define EXPORT_STATIC_CALL_TRAMP(name) \
+ EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
+#define EXPORT_STATIC_CALL_TRAMP_GPL(name) \
+ EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
+
#else /* Generic implementation */
static inline int static_call_init(void) { return 0; }
@@ -240,10 +247,15 @@ struct static_call_key {
void *func;
};
-#define DEFINE_STATIC_CALL(name, _func) \
+static inline long __static_call_return0(void)
+{
+ return 0;
+}
+
+#define __DEFINE_STATIC_CALL(name, _func, _func_init) \
DECLARE_STATIC_CALL(name, _func); \
struct static_call_key STATIC_CALL_KEY(name) = { \
- .func = _func, \
+ .func = _func_init, \
}
#define DEFINE_STATIC_CALL_NULL(name, _func) \
@@ -252,9 +264,6 @@ struct static_call_key {
.func = NULL, \
}
-#define static_call(name) \
- ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func))
-
static inline void __static_call_nop(void) { }
/*
@@ -295,4 +304,10 @@ static inline int static_call_text_reserved(void *start, void *end)
#endif /* CONFIG_HAVE_STATIC_CALL */
+#define DEFINE_STATIC_CALL(name, _func) \
+ __DEFINE_STATIC_CALL(name, _func, _func)
+
+#define DEFINE_STATIC_CALL_RET0(name, _func) \
+ __DEFINE_STATIC_CALL(name, _func, __static_call_return0)
+
#endif /* _LINUX_STATIC_CALL_H */
diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h
index 89135bb35bf76..ae5662d368b98 100644
--- a/include/linux/static_call_types.h
+++ b/include/linux/static_call_types.h
@@ -4,11 +4,13 @@
#include <linux/types.h>
#include <linux/stringify.h>
+#include <linux/compiler.h>
#define STATIC_CALL_KEY_PREFIX __SCK__
#define STATIC_CALL_KEY_PREFIX_STR __stringify(STATIC_CALL_KEY_PREFIX)
#define STATIC_CALL_KEY_PREFIX_LEN (sizeof(STATIC_CALL_KEY_PREFIX_STR) - 1)
#define STATIC_CALL_KEY(name) __PASTE(STATIC_CALL_KEY_PREFIX, name)
+#define STATIC_CALL_KEY_STR(name) __stringify(STATIC_CALL_KEY(name))
#define STATIC_CALL_TRAMP_PREFIX __SCT__
#define STATIC_CALL_TRAMP_PREFIX_STR __stringify(STATIC_CALL_TRAMP_PREFIX)
@@ -32,4 +34,52 @@ struct static_call_site {
s32 key;
};
+#define DECLARE_STATIC_CALL(name, func) \
+ extern struct static_call_key STATIC_CALL_KEY(name); \
+ extern typeof(func) STATIC_CALL_TRAMP(name);
+
+#ifdef CONFIG_HAVE_STATIC_CALL
+
+#define __raw_static_call(name) (&STATIC_CALL_TRAMP(name))
+
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+
+/*
+ * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from
+ * the symbol table so that objtool can reference it when it generates the
+ * .static_call_sites section.
+ */
+#define __STATIC_CALL_ADDRESSABLE(name) \
+ __ADDRESSABLE(STATIC_CALL_KEY(name))
+
+#define __static_call(name) \
+({ \
+ __STATIC_CALL_ADDRESSABLE(name); \
+ __raw_static_call(name); \
+})
+
+#else /* !CONFIG_HAVE_STATIC_CALL_INLINE */
+
+#define __STATIC_CALL_ADDRESSABLE(name)
+#define __static_call(name) __raw_static_call(name)
+
+#endif /* CONFIG_HAVE_STATIC_CALL_INLINE */
+
+#ifdef MODULE
+#define __STATIC_CALL_MOD_ADDRESSABLE(name)
+#define static_call_mod(name) __raw_static_call(name)
+#else
+#define __STATIC_CALL_MOD_ADDRESSABLE(name) __STATIC_CALL_ADDRESSABLE(name)
+#define static_call_mod(name) __static_call(name)
+#endif
+
+#define static_call(name) __static_call(name)
+
+#else
+
+#define static_call(name) \
+ ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func))
+
+#endif /* CONFIG_HAVE_STATIC_CALL */
+
#endif /* _STATIC_CALL_TYPES_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cd7b5c817ba2c..2839dc9a7c01a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -608,11 +608,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags);
/* kernel/futex.c */
asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
- struct __kernel_timespec __user *utime, u32 __user *uaddr2,
- u32 val3);
+ const struct __kernel_timespec __user *utime,
+ u32 __user *uaddr2, u32 val3);
asmlinkage long sys_futex_time32(u32 __user *uaddr, int op, u32 val,
- struct old_timespec32 __user *utime, u32 __user *uaddr2,
- u32 val3);
+ const struct old_timespec32 __user *utime,
+ u32 __user *uaddr2, u32 val3);
asmlinkage long sys_get_robust_list(int pid,
struct robust_list_head __user * __user *head_ptr,
size_t __user *len_ptr);
diff --git a/include/linux/timer.h b/include/linux/timer.h
index fda13c9d1256c..4118a97e62fb4 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -192,6 +192,8 @@ extern int try_to_del_timer_sync(struct timer_list *timer);
#define del_singleshot_timer_sync(t) del_timer_sync(t)
+extern bool timer_curr_running(struct timer_list *timer);
+
extern void init_timers(void);
struct hrtimer;
extern enum hrtimer_restart it_real_fn(struct hrtimer *);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index ad03df1cc2667..7634cd737061c 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -48,6 +48,7 @@ int arch_update_cpu_topology(void);
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE 10
#define REMOTE_DISTANCE 20
+#define DISTANCE_BITS 8
#ifndef node_distance
#define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
diff --git a/include/linux/torture.h b/include/linux/torture.h
index 7f65bd1dd3079..0910c5803f35a 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -32,11 +32,27 @@
#define TOROUT_STRING(s) \
pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s)
#define VERBOSE_TOROUT_STRING(s) \
- do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0)
+do { \
+ if (verbose) { \
+ verbose_torout_sleep(); \
+ pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); \
+ } \
+} while (0)
#define VERBOSE_TOROUT_ERRSTRING(s) \
- do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0)
+do { \
+ if (verbose) { \
+ verbose_torout_sleep(); \
+ pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); \
+ } \
+} while (0)
+void verbose_torout_sleep(void);
/* Definitions for online/offline exerciser. */
+#ifdef CONFIG_HOTPLUG_CPU
+int torture_num_online_cpus(void);
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+static inline int torture_num_online_cpus(void) { return 1; }
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
typedef void torture_ofl_func(void);
bool torture_offline(int cpu, long *n_onl_attempts, long *n_onl_successes,
unsigned long *sum_offl, int *min_onl, int *max_onl);
@@ -61,6 +77,13 @@ static inline void torture_random_init(struct torture_random_state *trsp)
trsp->trs_count = 0;
}
+/* Definitions for high-resolution-timer sleeps. */
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp);
+int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp);
+int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp);
+int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp);
+int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp);
+
/* Task shuffler, which causes CPUs to occasionally go idle. */
void torture_shuffle_task_register(struct task_struct *tp);
int torture_shuffle_init(long shuffint);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index cedcda6593f61..df92211cf7718 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -241,4 +241,10 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
int register_vmap_purge_notifier(struct notifier_block *nb);
int unregister_vmap_purge_notifier(struct notifier_block *nb);
+#ifdef CONFIG_MMU
+bool vmalloc_dump_obj(void *object);
+#else
+static inline bool vmalloc_dump_obj(void *object) { return false; }
+#endif
+
#endif /* _LINUX_VMALLOC_H */
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 155b5cb43cfd3..5fc29400e1a2d 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -505,6 +505,32 @@ TRACE_EVENT_RCU(rcu_callback,
__entry->qlen)
);
+TRACE_EVENT_RCU(rcu_segcb_stats,
+
+ TP_PROTO(struct rcu_segcblist *rs, const char *ctx),
+
+ TP_ARGS(rs, ctx),
+
+ TP_STRUCT__entry(
+ __field(const char *, ctx)
+ __array(unsigned long, gp_seq, RCU_CBLIST_NSEGS)
+ __array(long, seglen, RCU_CBLIST_NSEGS)
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ memcpy(__entry->seglen, rs->seglen, RCU_CBLIST_NSEGS * sizeof(long));
+ memcpy(__entry->gp_seq, rs->gp_seq, RCU_CBLIST_NSEGS * sizeof(unsigned long));
+
+ ),
+
+ TP_printk("%s seglen: (DONE=%ld, WAIT=%ld, NEXT_READY=%ld, NEXT=%ld) "
+ "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, NEXT=%lu)", __entry->ctx,
+ __entry->seglen[0], __entry->seglen[1], __entry->seglen[2], __entry->seglen[3],
+ __entry->gp_seq[0], __entry->gp_seq[1], __entry->gp_seq[2], __entry->gp_seq[3])
+
+);
+
/*
* Tracepoint for the registration of a single RCU callback of the special
* kvfree() form. The first argument is the RCU type, the second argument
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index cb6f841035608..ad15e40d7f5df 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -145,12 +145,14 @@ enum perf_event_sample_format {
PERF_SAMPLE_CGROUP = 1U << 21,
PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22,
PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23,
+ PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24,
- PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */
__PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */
};
+#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT)
/*
* values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set
*
@@ -912,7 +914,24 @@ enum perf_event_type {
* char data[size];
* u64 dyn_size; } && PERF_SAMPLE_STACK_USER
*
- * { u64 weight; } && PERF_SAMPLE_WEIGHT
+ * { union perf_sample_weight
+ * {
+ * u64 full; && PERF_SAMPLE_WEIGHT
+ * #if defined(__LITTLE_ENDIAN_BITFIELD)
+ * struct {
+ * u32 var1_dw;
+ * u16 var2_w;
+ * u16 var3_w;
+ * } && PERF_SAMPLE_WEIGHT_STRUCT
+ * #elif defined(__BIG_ENDIAN_BITFIELD)
+ * struct {
+ * u16 var3_w;
+ * u16 var2_w;
+ * u32 var1_dw;
+ * } && PERF_SAMPLE_WEIGHT_STRUCT
+ * #endif
+ * }
+ * }
* { u64 data_src; } && PERF_SAMPLE_DATA_SRC
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
@@ -1159,14 +1178,16 @@ union perf_mem_data_src {
mem_lvl_num:4, /* memory hierarchy level number */
mem_remote:1, /* remote */
mem_snoopx:2, /* snoop mode, ext */
- mem_rsvd:24;
+ mem_blk:3, /* access blocked */
+ mem_rsvd:21;
};
};
#elif defined(__BIG_ENDIAN_BITFIELD)
union perf_mem_data_src {
__u64 val;
struct {
- __u64 mem_rsvd:24,
+ __u64 mem_rsvd:21,
+ mem_blk:3, /* access blocked */
mem_snoopx:2, /* snoop mode, ext */
mem_remote:1, /* remote */
mem_lvl_num:4, /* memory hierarchy level number */
@@ -1249,6 +1270,12 @@ union perf_mem_data_src {
#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */
#define PERF_MEM_TLB_SHIFT 26
+/* Access blocked */
+#define PERF_MEM_BLK_NA 0x01 /* not available */
+#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */
+#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */
+#define PERF_MEM_BLK_SHIFT 40
+
#define PERF_MEM_S(a, s) \
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
@@ -1280,4 +1307,23 @@ struct perf_branch_entry {
reserved:40;
};
+union perf_sample_weight {
+ __u64 full;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ struct {
+ __u32 var1_dw;
+ __u16 var2_w;
+ __u16 var3_w;
+ };
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ struct {
+ __u16 var3_w;
+ __u16 var2_w;
+ __u32 var1_dw;
+ };
+#else
+#error "Unknown endianness"
+#endif
+};
+
#endif /* _UAPI_LINUX_PERF_EVENT_H */
diff --git a/init/Kconfig b/init/Kconfig
index 18d6af48fa5b4..4d18cfff99f14 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -525,7 +525,7 @@ config SCHED_THERMAL_PRESSURE
i.e. put less load on throttled CPUs than on non/less throttled ones.
This requires the architecture to implement
- arch_set_thermal_pressure() and arch_get_thermal_pressure().
+ arch_set_thermal_pressure() and arch_scale_thermal_pressure().
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf82259cff965..4160173016605 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -40,6 +40,7 @@ config PREEMPT
depends on !ARCH_NO_PREEMPT
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+ select PREEMPT_DYNAMIC if HAVE_PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -80,3 +81,21 @@ config PREEMPT_COUNT
config PREEMPTION
bool
select PREEMPT_COUNT
+
+config PREEMPT_DYNAMIC
+ bool
+ help
+ This option allows the preemption model to be selected with a kernel
+ command line parameter, overriding the default preemption model
+ chosen at compile time.
+
+ The feature is primarily interesting for Linux distributions which
+ provide a pre-built kernel binary to reduce the number of kernel
+ flavors they offer while still offering different usecases.
+
+ The runtime overhead is negligible with HAVE_STATIC_CALL_INLINE enabled
+ but if runtime patching is not available for the specific architecture
+ then the potential overhead should be considered.
+
+ Interesting if you want the same pre-built kernel to be used for
+ both Server and Desktop workloads.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 4e11e91010e11..1b6302ecbabe9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -330,6 +330,13 @@ void lockdep_assert_cpus_held(void)
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
+#ifdef CONFIG_LOCKDEP
+int lockdep_is_cpus_held(void)
+{
+ return percpu_rwsem_is_held(&cpu_hotplug_lock);
+}
+#endif
+
static void lockdep_acquire_cpus_lock(void)
{
rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f9d491b17b78b..8442e5c9cfa26 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -184,6 +184,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* enabled above.
*/
local_irq_disable_exit_to_user();
+
+ /* Check if any of the above work has queued a deferred wakeup */
+ rcu_nocb_flush_deferred_wakeup();
+
ti_work = READ_ONCE(current_thread_info()->flags);
}
@@ -197,6 +201,9 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
lockdep_assert_irqs_disabled();
+ /* Flush pending rcuog wakeup before the last need_resched() check */
+ rcu_nocb_flush_deferred_wakeup();
+
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
@@ -385,6 +392,9 @@ void irqentry_exit_cond_resched(void)
preempt_schedule_irq();
}
}
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#endif
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
@@ -411,8 +421,13 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
}
instrumentation_begin();
- if (IS_ENABLED(CONFIG_PREEMPTION))
+ if (IS_ENABLED(CONFIG_PREEMPTION)) {
+ /* Under PREEMPT_DYNAMIC, dispatch via the static call defined above */
+#ifdef CONFIG_PREEMPT_DYNAMIC
+ static_call(irqentry_exit_cond_resched)();
+#else
irqentry_exit_cond_resched();
+#endif
+ }
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c37401e3e5f73..129dee540a8be 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1597,50 +1597,91 @@ static void perf_event_groups_init(struct perf_event_groups *groups)
groups->index = 0;
}
+static inline struct cgroup *event_cgroup(const struct perf_event *event)
+{
+ struct cgroup *cgroup = NULL;
+
+#ifdef CONFIG_CGROUP_PERF
+ if (event->cgrp)
+ cgroup = event->cgrp->css.cgroup;
+#endif
+
+ return cgroup;
+}
+
/*
* Compare function for event groups;
*
* Implements complex key that first sorts by CPU and then by virtual index
* which provides ordering when rotating groups for the same CPU.
*/
-static bool
-perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+static __always_inline int
+perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
+ const u64 left_group_index, const struct perf_event *right)
{
- if (left->cpu < right->cpu)
- return true;
- if (left->cpu > right->cpu)
- return false;
+ if (left_cpu < right->cpu)
+ return -1;
+ if (left_cpu > right->cpu)
+ return 1;
#ifdef CONFIG_CGROUP_PERF
- if (left->cgrp != right->cgrp) {
- if (!left->cgrp || !left->cgrp->css.cgroup) {
- /*
- * Left has no cgroup but right does, no cgroups come
- * first.
- */
- return true;
- }
- if (!right->cgrp || !right->cgrp->css.cgroup) {
- /*
- * Right has no cgroup but left does, no cgroups come
- * first.
- */
- return false;
- }
- /* Two dissimilar cgroups, order by id. */
- if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
- return true;
+ {
+ const struct cgroup *right_cgroup = event_cgroup(right);
- return false;
+ if (left_cgroup != right_cgroup) {
+ if (!left_cgroup) {
+ /*
+ * Left has no cgroup but right does, no
+ * cgroups come first.
+ */
+ return -1;
+ }
+ if (!right_cgroup) {
+ /*
+ * Right has no cgroup but left does, no
+ * cgroups come first.
+ */
+ return 1;
+ }
+ /* Two dissimilar cgroups, order by id. */
+ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
+ return -1;
+
+ return 1;
+ }
}
#endif
- if (left->group_index < right->group_index)
- return true;
- if (left->group_index > right->group_index)
- return false;
+ if (left_group_index < right->group_index)
+ return -1;
+ if (left_group_index > right->group_index)
+ return 1;
- return false;
+ return 0;
+}
+
+#define __node_2_pe(node) \
+ rb_entry((node), struct perf_event, group_node)
+
+static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct perf_event *e = __node_2_pe(a);
+ return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
+ __node_2_pe(b)) < 0;
+}
+
+struct __group_key {
+ int cpu;
+ struct cgroup *cgroup;
+};
+
+static inline int __group_cmp(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
}
/*
@@ -1652,27 +1693,9 @@ static void
perf_event_groups_insert(struct perf_event_groups *groups,
struct perf_event *event)
{
- struct perf_event *node_event;
- struct rb_node *parent;
- struct rb_node **node;
-
event->group_index = ++groups->index;
- node = &groups->tree.rb_node;
- parent = *node;
-
- while (*node) {
- parent = *node;
- node_event = container_of(*node, struct perf_event, group_node);
-
- if (perf_event_groups_less(event, node_event))
- node = &parent->rb_left;
- else
- node = &parent->rb_right;
- }
-
- rb_link_node(&event->group_node, parent, node);
- rb_insert_color(&event->group_node, &groups->tree);
+ rb_add(&event->group_node, &groups->tree, __group_less);
}
/*
@@ -1720,45 +1743,17 @@ static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
struct cgroup *cgrp)
{
- struct perf_event *node_event = NULL, *match = NULL;
- struct rb_node *node = groups->tree.rb_node;
-#ifdef CONFIG_CGROUP_PERF
- u64 node_cgrp_id, cgrp_id = 0;
-
- if (cgrp)
- cgrp_id = cgrp->kn->id;
-#endif
-
- while (node) {
- node_event = container_of(node, struct perf_event, group_node);
-
- if (cpu < node_event->cpu) {
- node = node->rb_left;
- continue;
- }
- if (cpu > node_event->cpu) {
- node = node->rb_right;
- continue;
- }
-#ifdef CONFIG_CGROUP_PERF
- node_cgrp_id = 0;
- if (node_event->cgrp && node_event->cgrp->css.cgroup)
- node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+ struct __group_key key = {
+ .cpu = cpu,
+ .cgroup = cgrp,
+ };
+ struct rb_node *node;
- if (cgrp_id < node_cgrp_id) {
- node = node->rb_left;
- continue;
- }
- if (cgrp_id > node_cgrp_id) {
- node = node->rb_right;
- continue;
- }
-#endif
- match = node_event;
- node = node->rb_left;
- }
+ node = rb_find_first(&key, &groups->tree, __group_cmp);
+ if (node)
+ return __node_2_pe(node);
- return match;
+ return NULL;
}
/*
@@ -1767,27 +1762,17 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
static struct perf_event *
perf_event_groups_next(struct perf_event *event)
{
- struct perf_event *next;
-#ifdef CONFIG_CGROUP_PERF
- u64 curr_cgrp_id = 0;
- u64 next_cgrp_id = 0;
-#endif
-
- next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
- if (next == NULL || next->cpu != event->cpu)
- return NULL;
-
-#ifdef CONFIG_CGROUP_PERF
- if (event->cgrp && event->cgrp->css.cgroup)
- curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+ struct __group_key key = {
+ .cpu = event->cpu,
+ .cgroup = event_cgroup(event),
+ };
+ struct rb_node *next;
- if (next->cgrp && next->cgrp->css.cgroup)
- next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+ next = rb_next_match(&key, &event->group_node, __group_cmp);
+ if (next)
+ return __node_2_pe(next);
- if (curr_cgrp_id != next_cgrp_id)
- return NULL;
-#endif
- return next;
+ return NULL;
}
/*
@@ -1881,8 +1866,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
- if (sample_type & PERF_SAMPLE_WEIGHT)
- size += sizeof(data->weight);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ size += sizeof(data->weight.full);
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
@@ -6911,8 +6896,8 @@ void perf_output_sample(struct perf_output_handle *handle,
data->regs_user.regs);
}
- if (sample_type & PERF_SAMPLE_WEIGHT)
- perf_output_put(handle, data->weight);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ perf_output_put(handle, data->weight.full);
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
@@ -11588,6 +11573,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->sample_type & PERF_SAMPLE_CGROUP)
return -EINVAL;
#endif
+ if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
+ (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
+ return -EINVAL;
out:
return ret;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index bf9edd8d75be5..3ea7f8f92f1db 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -613,41 +613,56 @@ static void put_uprobe(struct uprobe *uprobe)
}
}
-static int match_uprobe(struct uprobe *l, struct uprobe *r)
+static __always_inline
+int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
+ const struct uprobe *r)
{
- if (l->inode < r->inode)
+ if (l_inode < r->inode)
return -1;
- if (l->inode > r->inode)
+ if (l_inode > r->inode)
return 1;
- if (l->offset < r->offset)
+ if (l_offset < r->offset)
return -1;
- if (l->offset > r->offset)
+ if (l_offset > r->offset)
return 1;
return 0;
}
+#define __node_2_uprobe(node) \
+ rb_entry((node), struct uprobe, rb_node)
+
+struct __uprobe_key {
+ struct inode *inode;
+ loff_t offset;
+};
+
+static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
+{
+ const struct __uprobe_key *a = key;
+ return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b));
+}
+
+static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct uprobe *u = __node_2_uprobe(a);
+ return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
+}
+
static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
{
- struct uprobe u = { .inode = inode, .offset = offset };
- struct rb_node *n = uprobes_tree.rb_node;
- struct uprobe *uprobe;
- int match;
+ struct __uprobe_key key = {
+ .inode = inode,
+ .offset = offset,
+ };
+ struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
- while (n) {
- uprobe = rb_entry(n, struct uprobe, rb_node);
- match = match_uprobe(&u, uprobe);
- if (!match)
- return get_uprobe(uprobe);
+ if (node)
+ return get_uprobe(__node_2_uprobe(node));
- if (match < 0)
- n = n->rb_left;
- else
- n = n->rb_right;
- }
return NULL;
}
@@ -668,32 +683,15 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
- struct rb_node **p = &uprobes_tree.rb_node;
- struct rb_node *parent = NULL;
- struct uprobe *u;
- int match;
+ struct rb_node *node;
- while (*p) {
- parent = *p;
- u = rb_entry(parent, struct uprobe, rb_node);
- match = match_uprobe(uprobe, u);
- if (!match)
- return get_uprobe(u);
+ node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
+ if (node)
+ return get_uprobe(__node_2_uprobe(node));
- if (match < 0)
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
-
- }
-
- u = NULL;
- rb_link_node(&uprobe->rb_node, parent, p);
- rb_insert_color(&uprobe->rb_node, &uprobes_tree);
/* get access + creation ref */
refcount_set(&uprobe->ref, 2);
-
- return u;
+ return NULL;
}
/*
diff --git a/kernel/futex.c b/kernel/futex.c
index 45a13eb8894e5..e68db77450392 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3012,7 +3012,7 @@ retry:
* Success, we're done! No tricky corner cases.
*/
if (!ret)
- goto out_putkey;
+ return ret;
/*
* The atomic access to the futex value generated a
* pagefault, so retry the user-access and the wakeup:
@@ -3029,7 +3029,7 @@ retry:
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_putkey;
+ return ret;
}
/*
@@ -3050,7 +3050,7 @@ retry:
default:
WARN_ON_ONCE(1);
- goto out_putkey;
+ return ret;
}
}
@@ -3061,7 +3061,6 @@ retry:
out_unlock:
spin_unlock(&hb->lock);
-out_putkey:
return ret;
pi_retry:
@@ -3763,8 +3762,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- struct __kernel_timespec __user *, utime, u32 __user *, uaddr2,
- u32, val3)
+ const struct __kernel_timespec __user *, utime,
+ u32 __user *, uaddr2, u32, val3)
{
struct timespec64 ts;
ktime_t t, *tp = NULL;
@@ -3959,7 +3958,7 @@ err_unlock:
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
- struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
+ const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
u32, val3)
{
struct timespec64 ts;
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 3994a217bde76..3bf98db9c702d 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -12,7 +12,6 @@
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
-#include <linux/random.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
@@ -101,7 +100,7 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
static DEFINE_PER_CPU(long, kcsan_skip);
/* For kcsan_prandom_u32_max(). */
-static DEFINE_PER_CPU(struct rnd_state, kcsan_rand_state);
+static DEFINE_PER_CPU(u32, kcsan_rand_state);
static __always_inline atomic_long_t *find_watchpoint(unsigned long addr,
size_t size,
@@ -275,20 +274,17 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
}
/*
- * Returns a pseudo-random number in interval [0, ep_ro). See prandom_u32_max()
- * for more details.
- *
- * The open-coded version here is using only safe primitives for all contexts
- * where we can have KCSAN instrumentation. In particular, we cannot use
- * prandom_u32() directly, as its tracepoint could cause recursion.
+ * Returns a pseudo-random number in interval [0, ep_ro). Simple linear
+ * congruential generator, using constants from "Numerical Recipes".
*/
static u32 kcsan_prandom_u32_max(u32 ep_ro)
{
- struct rnd_state *state = &get_cpu_var(kcsan_rand_state);
- const u32 res = prandom_u32_state(state);
+ u32 state = this_cpu_read(kcsan_rand_state);
+
+ state = 1664525 * state + 1013904223;
+ this_cpu_write(kcsan_rand_state, state);
- put_cpu_var(kcsan_rand_state);
- return (u32)(((u64) res * ep_ro) >> 32);
+ return state % ep_ro;
}
static inline void reset_kcsan_skip(void)
@@ -639,10 +635,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
void __init kcsan_init(void)
{
+ int cpu;
+
BUG_ON(!in_task());
kcsan_debugfs_init();
- prandom_seed_full_state(&kcsan_rand_state);
+
+ for_each_possible_cpu(cpu)
+ per_cpu(kcsan_rand_state, cpu) = (u32)get_cycles();
/*
* We are in the init task, and no other tasks should be running;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 6d11cfb9b41f2..8838f1d7c4a2e 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -15,6 +15,7 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif
+obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c
new file mode 100644
index 0000000000000..810b50344d358
--- /dev/null
+++ b/kernel/locking/irqflag-debug.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/irqflags.h>
+
+noinstr void warn_bogus_irq_restore(void)
+{
+ instrumentation_begin();
+ WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n");
+ instrumentation_end();
+}
+EXPORT_SYMBOL(warn_bogus_irq_restore);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index bdaf4829098c0..c6d0c1dc62532 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1290,6 +1290,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
class->name_version = count_matching_names(class);
class->wait_type_inner = lock->wait_type_inner;
class->wait_type_outer = lock->wait_type_outer;
+ class->lock_type = lock->lock_type;
/*
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
@@ -1671,6 +1672,7 @@ static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset)
static enum bfs_result __bfs(struct lock_list *source_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry,
int offset)
{
@@ -1731,7 +1733,12 @@ static enum bfs_result __bfs(struct lock_list *source_entry,
/*
* Step 3: we haven't visited this and there is a strong
* dependency path to this, so check with @match.
+ * If @skip is provided and returns true, we skip this
+ * lock (and any path this lock is in).
*/
+ if (skip && skip(lock, data))
+ continue;
+
if (match(lock, data)) {
*target_entry = lock;
return BFS_RMATCH;
@@ -1774,9 +1781,10 @@ static inline enum bfs_result
__bfs_forwards(struct lock_list *src_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry,
+ return __bfs(src_entry, data, match, skip, target_entry,
offsetof(struct lock_class, locks_after));
}
@@ -1785,9 +1793,10 @@ static inline enum bfs_result
__bfs_backwards(struct lock_list *src_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry,
+ return __bfs(src_entry, data, match, skip, target_entry,
offsetof(struct lock_class, locks_before));
}
@@ -2018,7 +2027,7 @@ static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
unsigned long count = 0;
struct lock_list *target_entry;
- __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -2043,7 +2052,7 @@ static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
unsigned long count = 0;
struct lock_list *target_entry;
- __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -2071,11 +2080,12 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
static noinline enum bfs_result
check_path(struct held_lock *target, struct lock_list *src_entry,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
enum bfs_result ret;
- ret = __bfs_forwards(src_entry, target, match, target_entry);
+ ret = __bfs_forwards(src_entry, target, match, skip, target_entry);
if (unlikely(bfs_error(ret)))
print_bfs_bug(ret);
@@ -2102,7 +2112,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
debug_atomic_inc(nr_cyclic_checks);
- ret = check_path(target, &src_entry, hlock_conflict, &target_entry);
+ ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry);
if (unlikely(ret == BFS_RMATCH)) {
if (!*trace) {
@@ -2120,46 +2130,6 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
return ret;
}
-#ifdef CONFIG_LOCKDEP_SMALL
-/*
- * Check that the dependency graph starting at <src> can lead to
- * <target> or not. If it can, <src> -> <target> dependency is already
- * in the graph.
- *
- * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if
- * any error appears in the bfs search.
- */
-static noinline enum bfs_result
-check_redundant(struct held_lock *src, struct held_lock *target)
-{
- enum bfs_result ret;
- struct lock_list *target_entry;
- struct lock_list src_entry;
-
- bfs_init_root(&src_entry, src);
- /*
- * Special setup for check_redundant().
- *
- * To report redundant, we need to find a strong dependency path that
- * is equal to or stronger than <src> -> <target>. So if <src> is E,
- * we need to let __bfs() only search for a path starting at a -(E*)->,
- * we achieve this by setting the initial node's ->only_xr to true in
- * that case. And if <prev> is S, we set initial ->only_xr to false
- * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
- */
- src_entry.only_xr = src->read == 0;
-
- debug_atomic_inc(nr_redundant_checks);
-
- ret = check_path(target, &src_entry, hlock_equal, &target_entry);
-
- if (ret == BFS_RMATCH)
- debug_atomic_inc(nr_redundant);
-
- return ret;
-}
-#endif
-
#ifdef CONFIG_TRACE_IRQFLAGS
/*
@@ -2230,6 +2200,44 @@ static inline bool usage_match(struct lock_list *entry, void *mask)
return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask);
}
+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ /*
+ * Skip local_lock() for irq inversion detection.
+ *
+ * For !RT, local_lock() is not a real lock, so it won't carry any
+ * dependency.
+ *
+ * For RT, an irq inversion happens when we have lock A and B, and on
+ * some CPU we can have:
+ *
+ * lock(A);
+ * <interrupted>
+ * lock(B);
+ *
+ * where lock(B) cannot sleep, and we have a dependency B -> ... -> A.
+ *
+ * Now we prove local_lock() cannot exist in that dependency. First we
+ * have the observation for any lock chain L1 -> ... -> Ln, for any
+ * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise
+ * wait context check will complain. And since B is not a sleep lock,
+ * therefore B.inner_wait_type >= 2, and since the inner_wait_type of
+ * local_lock() is 3, which is greater than 2, therefore there is no
+ * way the local_lock() exists in the dependency B -> ... -> A.
+ *
+ * As a result, we will skip local_lock(), when we search for irq
+ * inversion bugs.
+ */
+ if (entry->class->lock_type == LD_LOCK_PERCPU) {
+ if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
+
+ return true;
+ }
+
+ return false;
+}
+
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
@@ -2245,7 +2253,7 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
debug_atomic_inc(nr_find_usage_forwards_checks);
- result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
+ result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -2262,7 +2270,7 @@ find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
debug_atomic_inc(nr_find_usage_backwards_checks);
- result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
+ result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -2627,7 +2635,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
*/
bfs_init_rootb(&this, prev);
- ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
+ ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL);
if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
@@ -2694,8 +2702,68 @@ static inline int check_irq_usage(struct task_struct *curr,
{
return 1;
}
+
+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ return false;
+}
+
#endif /* CONFIG_TRACE_IRQFLAGS */
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. If it can, <src> -> <target> dependency is already
+ * in the graph.
+ *
+ * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if
+ * any error appears in the bfs search.
+ */
+static noinline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
+{
+ enum bfs_result ret;
+ struct lock_list *target_entry;
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
+ /*
+ * Special setup for check_redundant().
+ *
+ * To report redundant, we need to find a strong dependency path that
+ * is equal to or stronger than <src> -> <target>. So if <src> is E,
+ * we need to let __bfs() only search for a path starting at a -(E*)->,
+ * we achieve this by setting the initial node's ->only_xr to true in
+ * that case. And if <prev> is S, we set initial ->only_xr to false
+ * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
+ */
+ src_entry.only_xr = src->read == 0;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ /*
+ * Note: we skip local_lock() for the redundant check because, as the
+ * comment in usage_skip() explains, A -> local_lock() -> B and A -> B
+ * are not the same.
+ */
+ ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry);
+
+ if (ret == BFS_RMATCH)
+ debug_atomic_inc(nr_redundant);
+
+ return ret;
+}
+
+#else
+
+static inline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
+{
+ return BFS_RNOMATCH;
+}
+
+#endif
+
static void inc_chains(int irq_context)
{
if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
@@ -2916,7 +2984,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
-#ifdef CONFIG_LOCKDEP_SMALL
/*
* Is the <prev> -> <next> link redundant?
*/
@@ -2925,7 +2992,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return 0;
else if (ret == BFS_RMATCH)
return 2;
-#endif
if (!*trace) {
*trace = save_trace();
@@ -3707,7 +3773,7 @@ static void
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
{
- if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ if (!debug_locks_off() || debug_locks_silent)
return;
pr_warn("\n");
@@ -3748,6 +3814,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
+ graph_unlock();
print_usage_bug(curr, this, bad_bit, new_bit);
return 0;
}
@@ -4503,9 +4570,9 @@ print_lock_invalid_wait_context(struct task_struct *curr,
*/
static int check_wait_context(struct task_struct *curr, struct held_lock *next)
{
- short next_inner = hlock_class(next)->wait_type_inner;
- short next_outer = hlock_class(next)->wait_type_outer;
- short curr_inner;
+ u8 next_inner = hlock_class(next)->wait_type_inner;
+ u8 next_outer = hlock_class(next)->wait_type_outer;
+ u8 curr_inner;
int depth;
if (!curr->lockdep_depth || !next_inner || next->trylock)
@@ -4528,7 +4595,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- short prev_inner = hlock_class(prev)->wait_type_inner;
+ u8 prev_inner = hlock_class(prev)->wait_type_inner;
if (prev_inner) {
/*
@@ -4577,9 +4644,9 @@ static inline int check_wait_context(struct task_struct *curr,
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
+void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass,
- short inner, short outer)
+ u8 inner, u8 outer, u8 lock_type)
{
int i;
@@ -4602,6 +4669,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
lock->wait_type_outer = outer;
lock->wait_type_inner = inner;
+ lock->lock_type = lock_type;
/*
* No key, no joy, we need to hash something.
@@ -4636,7 +4704,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
raw_local_irq_restore(flags);
}
}
-EXPORT_SYMBOL_GPL(lockdep_init_map_waits);
+EXPORT_SYMBOL_GPL(lockdep_init_map_type);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index fd838cea39349..0ab94e1f1276a 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -27,7 +27,6 @@
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/slab.h>
-#include <linux/percpu-rwsem.h>
#include <linux/torture.h>
#include <linux/reboot.h>
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 5352ce50a97e3..adb9350907688 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -86,16 +86,6 @@ bool mutex_is_locked(struct mutex *lock)
}
EXPORT_SYMBOL(mutex_is_locked);
-__must_check enum mutex_trylock_recursive_enum
-mutex_trylock_recursive(struct mutex *lock)
-{
- if (unlikely(__mutex_owner(lock) == current))
- return MUTEX_TRYLOCK_RECURSIVE;
-
- return mutex_trylock(lock);
-}
-EXPORT_SYMBOL(mutex_trylock_recursive);
-
static inline unsigned long __owner_flags(unsigned long owner)
{
return owner & MUTEX_FLAGS;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2f8cd616d3b29..03b21135313cb 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -267,27 +267,18 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
return 1;
}
+#define __node_2_waiter(node) \
+ rb_entry((node), struct rt_mutex_waiter, tree_entry)
+
+static inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b));
+}
+
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &lock->waiters.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- bool leftmost = true;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
-
- rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
+ rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
}
static void
@@ -300,27 +291,18 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
RB_CLEAR_NODE(&waiter->tree_entry);
}
+#define __node_2_pi_waiter(node) \
+ rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
+
+static inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
+}
+
static void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- bool leftmost = true;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
-
- rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
+ rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
}
static void
@@ -1604,8 +1586,11 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * Futex variant, that since futex variants do not use the fast-path, can be
- * simple and will not need to retry.
+ * __rt_mutex_futex_unlock - Futex variant, that since futex variants
+ * do not use the fast-path, can be simple and will not need to retry.
+ *
+ * @lock: The rt_mutex to be unlocked
+ * @wake_q: The wake queue head from which to get the next lock waiter
*/
bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
@@ -1662,13 +1647,15 @@ void rt_mutex_destroy(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_destroy);
/**
- * __rt_mutex_init - initialize the rt lock
+ * __rt_mutex_init - initialize the rt_mutex
*
- * @lock: the rt lock to be initialized
+ * @lock: The rt_mutex to be initialized
+ * @name: The lock name used for debugging
+ * @key: The lock class key used for debugging
*
- * Initialize the rt lock to unlocked state.
+ * Initialize the rt_mutex to unlocked state.
*
- * Initializing of a locked rt lock is not allowed
+ * Initializing of a locked rt_mutex is not allowed
*/
void __rt_mutex_init(struct rt_mutex *lock, const char *name,
struct lock_class_key *key)
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
deleted file mode 100644
index e69de29bb2d1d..0000000000000
--- a/kernel/locking/rwsem.h
+++ /dev/null
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index cdc57b4f6d48a..3128b7cf8e1fd 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -95,6 +95,7 @@ config TASKS_RUDE_RCU
config TASKS_TRACE_RCU
def_bool 0
+ select IRQ_WORK
help
This option enables a task-based RCU implementation that uses
explicit rcu_read_lock_trace() read-side markers, and allows
@@ -188,8 +189,8 @@ config RCU_FAST_NO_HZ
config RCU_BOOST
bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
- default n
+ depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT
+ default y if PREEMPT_RT
help
This option boosts the priority of preempted RCU readers that
block the current preemptible RCU grace period for too long.
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 59ef1ae6dc37c..bf0827d4b6593 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -378,7 +378,11 @@ do { \
smp_mb__after_unlock_lock(); \
} while (0)
-#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
+#define raw_spin_unlock_rcu_node(p) \
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock(&ACCESS_PRIVATE(p, lock)); \
+} while (0)
#define raw_spin_lock_irq_rcu_node(p) \
do { \
@@ -387,7 +391,10 @@ do { \
} while (0)
#define raw_spin_unlock_irq_rcu_node(p) \
- raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)); \
+} while (0)
#define raw_spin_lock_irqsave_rcu_node(p, flags) \
do { \
@@ -396,7 +403,10 @@ do { \
} while (0)
#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
- raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags); \
+} while (0)
#define raw_spin_trylock_rcu_node(p) \
({ \
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2d2a6b6b9dfb2..7f181c9675f76 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -7,10 +7,10 @@
* Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
-#include <linux/types.h>
-#include <linux/kernel.h>
+#include <linux/cpu.h>
#include <linux/interrupt.h>
-#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
#include "rcu_segcblist.h"
@@ -88,23 +88,135 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
#endif
}
+/* Get the length of a segment of the rcu_segcblist structure. */
+static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+ return READ_ONCE(rsclp->seglen[seg]);
+}
+
+/* Return number of callbacks in segmented callback list by summing seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
+{
+ long len = 0;
+ int i;
+
+ for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
+ len += rcu_segcblist_get_seglen(rsclp, i);
+
+ return len;
+}
+
+/* Set the length of a segment of the rcu_segcblist structure. */
+static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+ WRITE_ONCE(rsclp->seglen[seg], v);
+}
+
+/* Increase the numeric length of a segment by a specified amount. */
+static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+ WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v);
+}
+
+/* Move from's segment length to to's segment. */
+static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to)
+{
+ long len;
+
+ if (from == to)
+ return;
+
+ len = rcu_segcblist_get_seglen(rsclp, from);
+ if (!len)
+ return;
+
+ rcu_segcblist_add_seglen(rsclp, to, len);
+ rcu_segcblist_set_seglen(rsclp, from, 0);
+}
+
+/* Increment segment's length. */
+static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+ rcu_segcblist_add_seglen(rsclp, seg, 1);
+}
+
/*
* Increase the numeric length of an rcu_segcblist structure by the
* specified amount, which can be negative. This can cause the ->len
* field to disagree with the actual number of callbacks on the structure.
* This increase is fully ordered with respect to the callers accesses
* both before and after.
+ *
+ * So why on earth is a memory barrier required both before and after
+ * the update to the ->len field???
+ *
+ * The reason is that rcu_barrier() locklessly samples each CPU's ->len
+ * field, and if a given CPU's field is zero, avoids IPIing that CPU.
+ * This can of course race with both queuing and invoking of callbacks.
+ * Failing to correctly handle either of these races could result in
+ * rcu_barrier() failing to IPI a CPU that actually had callbacks queued
+ * which rcu_barrier() was obligated to wait on. And if rcu_barrier()
+ * failed to wait on such a callback, unloading certain kernel modules
+ * would result in calls to functions whose code was no longer present in
+ * the kernel, for but one example.
+ *
+ * Therefore, ->len transitions from 1->0 and 0->1 have to be carefully
+ * ordered with respect to both list modifications and the rcu_barrier().
+ *
+ * The queuing case is CASE 1 and the invoking case is CASE 2.
+ *
+ * CASE 1: Suppose that CPU 0 has no callbacks queued, but invokes
+ * call_rcu() just as CPU 1 invokes rcu_barrier(). CPU 0's ->len field
+ * will transition from 0->1, which is one of the transitions that must
+ * be handled carefully. Without the full memory barriers after the ->len
+ * update and at the beginning of rcu_barrier(), the following could happen:
+ *
+ * CPU 0 CPU 1
+ *
+ * call_rcu().
+ * rcu_barrier() sees ->len as 0.
+ * set ->len = 1.
+ * rcu_barrier() does nothing.
+ * module is unloaded.
+ * callback invokes unloaded function!
+ *
+ * With the full barriers, any case where rcu_barrier() sees ->len as 0 will
+ * have unambiguously preceded the return from the racing call_rcu(), which
+ * means that this call_rcu() invocation is OK to not wait on. After all,
+ * you are supposed to make sure that any problematic call_rcu() invocations
+ * happen before the rcu_barrier().
+ *
+ *
+ * CASE 2: Suppose that CPU 0 is invoking its last callback just as
+ * CPU 1 invokes rcu_barrier(). CPU 0's ->len field will transition from
+ * 1->0, which is one of the transitions that must be handled carefully.
+ * Without the full memory barriers before the ->len update and at the
+ * end of rcu_barrier(), the following could happen:
+ *
+ * CPU 0 CPU 1
+ *
+ * start invoking last callback
+ * set ->len = 0 (reordered)
+ * rcu_barrier() sees ->len as 0
+ * rcu_barrier() does nothing.
+ * module is unloaded
+ * callback executing after unloaded!
+ *
+ * With the full barriers, any case where rcu_barrier() sees ->len as 0
+ * will be fully ordered after the completion of the callback function,
+ * so that the module unloading operation is completely safe.
+ *
*/
-static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
{
#ifdef CONFIG_RCU_NOCB_CPU
- smp_mb__before_atomic(); /* Up to the caller! */
+ smp_mb__before_atomic(); // Read header comment above.
atomic_long_add(v, &rsclp->len);
- smp_mb__after_atomic(); /* Up to the caller! */
+ smp_mb__after_atomic(); // Read header comment above.
#else
- smp_mb(); /* Up to the caller! */
+ smp_mb(); // Read header comment above.
WRITE_ONCE(rsclp->len, rsclp->len + v);
- smp_mb(); /* Up to the caller! */
+ smp_mb(); // Read header comment above.
#endif
}
@@ -120,26 +232,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
}
/*
- * Exchange the numeric length of the specified rcu_segcblist structure
- * with the specified value. This can cause the ->len field to disagree
- * with the actual number of callbacks on the structure. This exchange is
- * fully ordered with respect to the callers accesses both before and after.
- */
-static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
-{
-#ifdef CONFIG_RCU_NOCB_CPU
- return atomic_long_xchg(&rsclp->len, v);
-#else
- long ret = rsclp->len;
-
- smp_mb(); /* Up to the caller! */
- WRITE_ONCE(rsclp->len, v);
- smp_mb(); /* Up to the caller! */
- return ret;
-#endif
-}
-
-/*
* Initialize an rcu_segcblist structure.
*/
void rcu_segcblist_init(struct rcu_segcblist *rsclp)
@@ -149,10 +241,12 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
rsclp->head = NULL;
- for (i = 0; i < RCU_CBLIST_NSEGS; i++)
+ for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
rsclp->tails[i] = &rsclp->head;
+ rcu_segcblist_set_seglen(rsclp, i, 0);
+ }
rcu_segcblist_set_len(rsclp, 0);
- rsclp->enabled = 1;
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED);
}
/*
@@ -163,16 +257,21 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
{
WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
- rsclp->enabled = 0;
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED);
}
/*
* Mark the specified rcu_segcblist structure as offloaded. This
* structure must be empty.
*/
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
{
- rsclp->offloaded = 1;
+ if (offload) {
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY);
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED);
+ } else {
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
+ }
}
/*
@@ -245,7 +344,7 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp)
{
rcu_segcblist_inc_len(rsclp);
- smp_mb(); /* Ensure counts are updated before callback is enqueued. */
+ rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL);
rhp->next = NULL;
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
@@ -274,6 +373,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
if (rsclp->tails[i] != rsclp->tails[i - 1])
break;
+ rcu_segcblist_inc_seglen(rsclp, i);
WRITE_ONCE(*rsclp->tails[i], rhp);
for (; i <= RCU_NEXT_TAIL; i++)
WRITE_ONCE(rsclp->tails[i], &rhp->next);
@@ -281,21 +381,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
}
/*
- * Extract only the counts from the specified rcu_segcblist structure,
- * and place them in the specified rcu_cblist structure. This function
- * supports both callback orphaning and invocation, hence the separation
- * of counts and callbacks. (Callbacks ready for invocation must be
- * orphaned and adopted separately from pending callbacks, but counts
- * apply to all callbacks. Locking must be used to make sure that
- * both orphaned-callbacks lists are consistent.)
- */
-void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
- struct rcu_cblist *rclp)
-{
- rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
-}
-
-/*
* Extract only those callbacks ready to be invoked from the specified
* rcu_segcblist structure and place them in the specified rcu_cblist
* structure.
@@ -307,6 +392,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_ready_cbs(rsclp))
return; /* Nothing to do. */
+ rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL);
*rclp->tail = rsclp->head;
WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]);
WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
@@ -314,6 +400,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
WRITE_ONCE(rsclp->tails[i], &rsclp->head);
+ rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
}
/*
@@ -330,11 +417,15 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_pend_cbs(rsclp))
return; /* Nothing to do. */
+ rclp->len = 0;
*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
- for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
+ for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) {
+ rclp->len += rcu_segcblist_get_seglen(rsclp, i);
WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
+ rcu_segcblist_set_seglen(rsclp, i, 0);
+ }
}
/*
@@ -345,7 +436,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
rcu_segcblist_add_len(rsclp, rclp->len);
- rclp->len = 0;
}
/*
@@ -359,6 +449,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
if (!rclp->head)
return; /* No callbacks to move. */
+ rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len);
*rclp->tail = rsclp->head;
WRITE_ONCE(rsclp->head, rclp->head);
for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
@@ -379,6 +470,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
{
if (!rclp->head)
return; /* Nothing to do. */
+
+ rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len);
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
}
@@ -403,6 +496,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
break;
WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
+ rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
}
/* If no callbacks moved, nothing more need be done. */
@@ -423,6 +517,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
break; /* No more callbacks. */
WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
+ rcu_segcblist_move_seglen(rsclp, i, j);
rsclp->gp_seq[j] = rsclp->gp_seq[i];
}
}
@@ -444,7 +539,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
*/
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
{
- int i;
+ int i, j;
WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
@@ -487,6 +582,10 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL)
return false;
+ /* Accounting: every segment after i is about to get merged into i. */
+ for (j = i + 1; j <= RCU_NEXT_TAIL; j++)
+ rcu_segcblist_move_seglen(rsclp, j, i);
+
/*
* Merge all later callbacks, including newly arrived callbacks,
* into the segment located by the for-loop above. Assign "seq"
@@ -514,13 +613,24 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
struct rcu_cblist donecbs;
struct rcu_cblist pendcbs;
+ lockdep_assert_cpus_held();
+
rcu_cblist_init(&donecbs);
rcu_cblist_init(&pendcbs);
- rcu_segcblist_extract_count(src_rsclp, &donecbs);
+
rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+
+ /*
+ * No need for smp_mb() before setting length to 0, because CPU hotplug
+ * lock excludes rcu_barrier.
+ */
+ rcu_segcblist_set_len(src_rsclp, 0);
+
rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_count(dst_rsclp, &pendcbs);
rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+
rcu_segcblist_init(src_rsclp);
}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 492262bcb5911..9a19328ff2514 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
return READ_ONCE(rclp->len);
}
+/* Return number of callbacks in segmented callback list by summing seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
+
void rcu_cblist_init(struct rcu_cblist *rclp);
void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
@@ -50,19 +53,51 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
#endif
}
+static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ rsclp->flags |= flags;
+}
+
+static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ rsclp->flags &= ~flags;
+}
+
+static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ return READ_ONCE(rsclp->flags) & flags;
+}
+
/*
* Is the specified rcu_segcblist enabled, for example, not corresponding
* to an offline CPU?
*/
static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
{
- return rsclp->enabled;
+ return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED);
}
-/* Is the specified rcu_segcblist offloaded? */
+/* Is the specified rcu_segcblist offloaded, that is, is SEGCBLIST_SOFTIRQ_ONLY clear? */
static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
{
- return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded;
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY))
+ return true;
+
+ return false;
+}
+
+static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp)
+{
+ int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED;
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags)
+ return true;
+
+ return false;
}
/*
@@ -75,10 +110,22 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));
}
+/*
+ * Is the specified segment of the specified rcu_segcblist structure
+ * empty of callbacks?
+ */
+static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
+{
+ if (seg == RCU_DONE_TAIL)
+ return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
+ return rsclp->tails[seg - 1] == rsclp->tails[seg];
+}
+
void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v);
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp);
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
@@ -88,8 +135,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
struct rcu_head *rhp);
-void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
- struct rcu_cblist *rclp);
void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp);
void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 528ed10b78fdc..99657ffa66887 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -85,6 +85,7 @@ torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(bool, gp_normal, false,
"Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
@@ -97,6 +98,8 @@ torture_param(int, object_debug, 0,
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0,
"Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
+torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
torture_param(int, read_exit_delay, 13,
"Delay between read-then-exit episodes (s)");
torture_param(int, read_exit_burst, 16,
@@ -127,10 +130,12 @@ static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
+static int nrealnocbers;
static int nrealreaders;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
+static struct task_struct **nocb_tasks;
static struct task_struct *stats_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
@@ -142,11 +147,22 @@ static struct task_struct *read_exit_task;
#define RCU_TORTURE_PIPE_LEN 10
+// Mailbox-like structure to check RCU global memory ordering.
+struct rcu_torture_reader_check {
+ unsigned long rtc_myloops;
+ int rtc_chkrdr;
+ unsigned long rtc_chkloops;
+ int rtc_ready;
+ struct rcu_torture_reader_check *rtc_assigner;
+} ____cacheline_internodealigned_in_smp;
+
+// Update-side data structure used to check RCU readers.
struct rcu_torture {
struct rcu_head rtort_rcu;
int rtort_pipe_count;
struct list_head rtort_free;
int rtort_mbtest;
+ struct rcu_torture_reader_check *rtort_chkp;
};
static LIST_HEAD(rcu_torture_freelist);
@@ -157,10 +173,13 @@ static DEFINE_SPINLOCK(rcu_torture_lock);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch);
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+static struct rcu_torture_reader_check *rcu_torture_reader_mbchk;
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_mbchk_fail;
+static atomic_t n_rcu_torture_mbchk_tries;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
@@ -174,6 +193,8 @@ static unsigned long n_read_exits;
static struct list_head rcu_torture_removed;
static unsigned long shutdown_jiffies;
static unsigned long start_gp_seq;
+static atomic_long_t n_nocb_offload;
+static atomic_long_t n_nocb_deoffload;
static int rcu_torture_writer_state;
#define RTWS_FIXED_DELAY 0
@@ -183,9 +204,11 @@ static int rcu_torture_writer_state;
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
#define RTWS_COND_SYNC 6
-#define RTWS_SYNC 7
-#define RTWS_STUTTER 8
-#define RTWS_STOPPING 9
+#define RTWS_POLL_GET 7
+#define RTWS_POLL_WAIT 8
+#define RTWS_SYNC 9
+#define RTWS_STUTTER 10
+#define RTWS_STOPPING 11
static const char * const rcu_torture_writer_state_names[] = {
"RTWS_FIXED_DELAY",
"RTWS_DELAY",
@@ -194,6 +217,8 @@ static const char * const rcu_torture_writer_state_names[] = {
"RTWS_EXP_SYNC",
"RTWS_COND_GET",
"RTWS_COND_SYNC",
+ "RTWS_POLL_GET",
+ "RTWS_POLL_WAIT",
"RTWS_SYNC",
"RTWS_STUTTER",
"RTWS_STOPPING",
@@ -311,7 +336,9 @@ struct rcu_torture_ops {
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
- unsigned long (*get_state)(void);
+ unsigned long (*get_gp_state)(void);
+ unsigned long (*start_gp_poll)(void);
+ bool (*poll_gp_state)(unsigned long oldstate);
void (*cond_sync)(unsigned long oldstate);
call_rcu_func_t call;
void (*cb_barrier)(void);
@@ -386,7 +413,12 @@ static bool
rcu_torture_pipe_update_one(struct rcu_torture *rp)
{
int i;
+ struct rcu_torture_reader_check *rtrcp = READ_ONCE(rp->rtort_chkp);
+ if (rtrcp) {
+ WRITE_ONCE(rp->rtort_chkp, NULL);
+ smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire().
+ }
i = READ_ONCE(rp->rtort_pipe_count);
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
@@ -461,7 +493,7 @@ static struct rcu_torture_ops rcu_ops = {
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
- .get_state = get_state_synchronize_rcu,
+ .get_gp_state = get_state_synchronize_rcu,
.cond_sync = cond_synchronize_rcu,
.call = call_rcu,
.cb_barrier = rcu_barrier,
@@ -570,6 +602,21 @@ static void srcu_torture_synchronize(void)
synchronize_srcu(srcu_ctlp);
}
+static unsigned long srcu_torture_get_gp_state(void)
+{
+ return get_state_synchronize_srcu(srcu_ctlp);
+}
+
+static unsigned long srcu_torture_start_gp_poll(void)
+{
+ return start_poll_synchronize_srcu(srcu_ctlp);
+}
+
+static bool srcu_torture_poll_gp_state(unsigned long oldstate)
+{
+ return poll_state_synchronize_srcu(srcu_ctlp, oldstate);
+}
+
static void srcu_torture_call(struct rcu_head *head,
rcu_callback_t func)
{
@@ -601,6 +648,9 @@ static struct rcu_torture_ops srcu_ops = {
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
@@ -1018,42 +1068,26 @@ rcu_torture_fqs(void *arg)
return 0;
}
+// Used by writers to randomly choose from the available grace-period
+// primitives. The only purpose of the initialization is to size the array.
+static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC };
+static int nsynctypes;
+
/*
- * RCU torture writer kthread. Repeatedly substitutes a new structure
- * for that pointed to by rcu_torture_current, freeing the old structure
- * after a series of grace periods (the "pipeline").
+ * Determine which grace-period primitives are available.
*/
-static int
-rcu_torture_writer(void *arg)
+static void rcu_torture_write_types(void)
{
- bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
- int expediting = 0;
- unsigned long gp_snap;
bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
- bool gp_sync1 = gp_sync;
- int i;
- int oldnice = task_nice(current);
- struct rcu_torture *rp;
- struct rcu_torture *old_rp;
- static DEFINE_TORTURE_RANDOM(rand);
- bool stutter_waited;
- int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
- RTWS_COND_GET, RTWS_SYNC };
- int nsynctypes = 0;
-
- VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
- if (!can_expedite)
- pr_alert("%s" TORTURE_FLAG
- " GP expediting controlled from boot/sysfs for %s.\n",
- torture_type, cur_ops->name);
+ bool gp_poll1 = gp_poll, gp_sync1 = gp_sync;
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
- gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
- if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) {
+ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1)
+ gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true;
+ if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
pr_info("%s: Testing conditional GPs.\n", __func__);
- } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) {
+ } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) {
pr_alert("%s: gp_cond without primitives.\n", __func__);
}
if (gp_exp1 && cur_ops->exp_sync) {
@@ -1068,12 +1102,46 @@ rcu_torture_writer(void *arg)
} else if (gp_normal && !cur_ops->deferred_free) {
pr_alert("%s: gp_normal without primitives.\n", __func__);
}
+ if (gp_poll1 && cur_ops->start_gp_poll && cur_ops->poll_gp_state) {
+ synctype[nsynctypes++] = RTWS_POLL_GET;
+ pr_info("%s: Testing polling GPs.\n", __func__);
+ } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
+ pr_alert("%s: gp_poll without primitives.\n", __func__);
+ }
if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
pr_info("%s: Testing normal GPs.\n", __func__);
} else if (gp_sync && !cur_ops->sync) {
pr_alert("%s: gp_sync without primitives.\n", __func__);
}
+}
+
+/*
+ * RCU torture writer kthread. Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+ bool boot_ended;
+ bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
+ unsigned long cookie;
+ int expediting = 0;
+ unsigned long gp_snap;
+ int i;
+ int idx;
+ int oldnice = task_nice(current);
+ struct rcu_torture *rp;
+ struct rcu_torture *old_rp;
+ static DEFINE_TORTURE_RANDOM(rand);
+ bool stutter_waited;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+ if (!can_expedite)
+ pr_alert("%s" TORTURE_FLAG
+ " GP expediting controlled from boot/sysfs for %s.\n",
+ torture_type, cur_ops->name);
if (WARN_ONCE(nsynctypes == 0,
"rcu_torture_writer: No update-side primitives.\n")) {
/*
@@ -1087,7 +1155,7 @@ rcu_torture_writer(void *arg)
do {
rcu_torture_writer_state = RTWS_FIXED_DELAY;
- schedule_timeout_uninterruptible(1);
+ torture_hrtimeout_us(500, 1000, &rand);
rp = rcu_torture_alloc();
if (rp == NULL)
continue;
@@ -1107,6 +1175,18 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(old_rp->rtort_pipe_count,
old_rp->rtort_pipe_count + 1);
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
+ idx = cur_ops->readlock();
+ cookie = cur_ops->get_gp_state();
+ WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE &&
+ cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 1 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
+ cur_ops->readunlock(idx);
+ }
switch (synctype[torture_random(&rand) % nsynctypes]) {
case RTWS_DEF_FREE:
rcu_torture_writer_state = RTWS_DEF_FREE;
@@ -1119,15 +1199,21 @@ rcu_torture_writer(void *arg)
break;
case RTWS_COND_GET:
rcu_torture_writer_state = RTWS_COND_GET;
- gp_snap = cur_ops->get_state();
- i = torture_random(&rand) % 16;
- if (i != 0)
- schedule_timeout_interruptible(i);
- udelay(torture_random(&rand) % 1000);
+ gp_snap = cur_ops->get_gp_state();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC;
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_POLL_GET:
+ rcu_torture_writer_state = RTWS_POLL_GET;
+ gp_snap = cur_ops->start_gp_poll();
+ rcu_torture_writer_state = RTWS_POLL_WAIT;
+ while (!cur_ops->poll_gp_state(gp_snap))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
cur_ops->sync();
@@ -1137,6 +1223,14 @@ rcu_torture_writer(void *arg)
WARN_ON_ONCE(1);
break;
}
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE &&
+ !cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
}
WRITE_ONCE(rcu_torture_current_version,
rcu_torture_current_version + 1);
@@ -1155,12 +1249,13 @@ rcu_torture_writer(void *arg)
!rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
+ boot_ended = rcu_inkernel_boot_has_ended();
stutter_waited = stutter_wait("rcu_torture_writer");
if (stutter_waited &&
!READ_ONCE(rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
- rcu_inkernel_boot_has_ended())
+ boot_ended)
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) !=
@@ -1194,26 +1289,43 @@ rcu_torture_writer(void *arg)
static int
rcu_torture_fakewriter(void *arg)
{
+ unsigned long gp_snap;
DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
- udelay(torture_random(&rand) & 0x3ff);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
torture_random(&rand) % (nfakewriters * 8) == 0) {
cur_ops->cb_barrier();
- } else if (gp_normal == gp_exp) {
- if (cur_ops->sync && torture_random(&rand) & 0x80)
- cur_ops->sync();
- else if (cur_ops->exp_sync)
+ } else {
+ switch (synctype[torture_random(&rand) % nsynctypes]) {
+ case RTWS_DEF_FREE:
+ break;
+ case RTWS_EXP_SYNC:
cur_ops->exp_sync();
- } else if (gp_normal && cur_ops->sync) {
- cur_ops->sync();
- } else if (cur_ops->exp_sync) {
- cur_ops->exp_sync();
+ break;
+ case RTWS_COND_GET:
+ gp_snap = cur_ops->get_gp_state();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync(gp_snap);
+ break;
+ case RTWS_POLL_GET:
+ gp_snap = cur_ops->start_gp_poll();
+ while (!cur_ops->poll_gp_state(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_SYNC:
+ cur_ops->sync();
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
}
stutter_wait("rcu_torture_fakewriter");
} while (!torture_must_stop());
@@ -1227,6 +1339,62 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp)
kfree(rhp);
}
+/*
+ * Set up and carry out testing of RCU's global memory ordering.
+ *
+ * Each call from reader @myid increments that reader's ->rtc_myloops
+ * counter. The call may then (a) try to give a randomly chosen "checker"
+ * reader the job of checking a randomly chosen "checked" reader, saving
+ * a snapshot of the checked reader's counter in ->rtc_chkloops, and
+ * (b) carry out any check previously assigned to this reader once the
+ * assigner's ->rtc_ready flag is set. A check is counted as a failure
+ * in n_rcu_torture_mbchk_fail if the checked reader's current counter
+ * compares less than the snapshot.
+ *
+ * @myid: index into rcu_torture_reader_mbchk[]; negative disables checking.
+ * @rtp: rcu_torture element whose ->rtort_chkp is set to this reader's state.
+ * @trsp: random-number state used to pick the checked and checker readers.
+ */
+static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
+ struct torture_random_state *trsp)
+{
+ unsigned long loops;
+ int noc = torture_num_online_cpus();
+ int rdrchked;
+ int rdrchker;
+ struct rcu_torture_reader_check *rtrcp; // Me.
+ struct rcu_torture_reader_check *rtrcp_assigner; // Assigned us to do checking.
+ struct rcu_torture_reader_check *rtrcp_chked; // Reader being checked.
+ struct rcu_torture_reader_check *rtrcp_chker; // Reader doing checking when not me.
+
+ if (myid < 0)
+ return; // Don't try this from timer handlers.
+
+ // Increment my counter.
+ rtrcp = &rcu_torture_reader_mbchk[myid];
+ WRITE_ONCE(rtrcp->rtc_myloops, rtrcp->rtc_myloops + 1);
+
+ // Attempt to assign someone else some checking work.
+ rdrchked = torture_random(trsp) % nrealreaders;
+ rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked];
+ rdrchker = torture_random(trsp) % nrealreaders;
+ rtrcp_chker = &rcu_torture_reader_mbchk[rdrchker];
+ if (rdrchked != myid && rdrchked != rdrchker && noc >= rdrchked && noc >= rdrchker &&
+ smp_load_acquire(&rtrcp->rtc_chkrdr) < 0 && // Pairs with smp_store_release below.
+ !READ_ONCE(rtp->rtort_chkp) &&
+ !smp_load_acquire(&rtrcp_chker->rtc_assigner)) { // Pairs with smp_store_release below.
+ rtrcp->rtc_chkloops = READ_ONCE(rtrcp_chked->rtc_myloops); // Snapshot the checked reader's counter.
+ WARN_ON_ONCE(rtrcp->rtc_chkrdr >= 0);
+ rtrcp->rtc_chkrdr = rdrchked;
+ WARN_ON_ONCE(rtrcp->rtc_ready); // This gets set after the grace period ends.
+ if (cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, NULL, rtrcp) ||
+ cmpxchg_relaxed(&rtp->rtort_chkp, NULL, rtrcp))
+ (void)cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, rtrcp, NULL); // Back out.
+ }
+
+ // If assigned some completed work, do it!
+ rtrcp_assigner = READ_ONCE(rtrcp->rtc_assigner);
+ if (!rtrcp_assigner || !smp_load_acquire(&rtrcp_assigner->rtc_ready))
+ return; // No work or work not yet ready.
+ rdrchked = rtrcp_assigner->rtc_chkrdr;
+ if (WARN_ON_ONCE(rdrchked < 0))
+ return;
+ rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked];
+ loops = READ_ONCE(rtrcp_chked->rtc_myloops);
+ atomic_inc(&n_rcu_torture_mbchk_tries);
+ if (ULONG_CMP_LT(loops, rtrcp_assigner->rtc_chkloops)) // Current count is behind the snapshot.
+ atomic_inc(&n_rcu_torture_mbchk_fail);
+ rtrcp_assigner->rtc_chkloops = loops + ULONG_MAX / 2; // NOTE(review): presumably invalidates the old snapshot -- confirm.
+ rtrcp_assigner->rtc_ready = 0;
+ smp_store_release(&rtrcp->rtc_assigner, NULL); // Someone else can assign us work.
+ smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign.
+}
+
/*
* Do one extension of an RCU read-side critical section using the
* current reader state in readstate (set to zero for initial entry
@@ -1362,8 +1530,9 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
* no data to read. Can be invoked both from process context and
* from a timer handler.
*/
-static bool rcu_torture_one_read(struct torture_random_state *trsp)
+static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
{
+ unsigned long cookie;
int i;
unsigned long started;
unsigned long completed;
@@ -1379,6 +1548,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
@@ -1394,6 +1565,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
}
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
+ rcu_torture_reader_do_mbchk(myid, p, trsp);
rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp);
preempt_disable();
pipe_count = READ_ONCE(p->rtort_pipe_count);
@@ -1415,6 +1587,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 3 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK);
// This next splat is expected behavior if leakpointer, especially
@@ -1443,7 +1622,7 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand);
static void rcu_torture_timer(struct timer_list *unused)
{
atomic_long_inc(&n_rcu_torture_timers);
- (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand));
+ (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1);
/* Test call_rcu() invocation from interrupt handler. */
if (cur_ops->call) {
@@ -1479,13 +1658,13 @@ rcu_torture_reader(void *arg)
if (!timer_pending(&t))
mod_timer(&t, jiffies + 1);
}
- if (!rcu_torture_one_read(&rand) && !torture_must_stop())
+ if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop())
schedule_timeout_interruptible(HZ);
if (time_after(jiffies, lastsleep) && !torture_must_stop()) {
- schedule_timeout_interruptible(1);
+ torture_hrtimeout_us(500, 1000, &rand);
lastsleep = jiffies + 10;
}
- while (num_online_cpus() < mynumonline && !torture_must_stop())
+ while (torture_num_online_cpus() < mynumonline && !torture_must_stop())
schedule_timeout_interruptible(HZ / 5);
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
@@ -1499,6 +1678,53 @@ rcu_torture_reader(void *arg)
}
/*
+ * Randomly toggle CPUs' callback-offload state. This uses hrtimers to
+ * increase race probabilities and fuzzes the interval between toggling.
+ *
+ * After waiting for in-kernel boot to end, each pass picks a random CPU
+ * number no larger than the last CPU seen by for_each_online_cpu() at
+ * startup and either offloads or deoffloads it. It then sleeps for
+ * nocbs_toggle milliseconds plus a random fuzz that is normally less
+ * than one-eighth of that interval. Always returns 0.
+ */
+static int rcu_nocb_toggle(void *arg)
+{
+ int cpu;
+ int maxcpu = -1;
+ int oldnice = task_nice(current);
+ long r;
+ DEFINE_TORTURE_RANDOM(rand);
+ ktime_t toggle_delay;
+ unsigned long toggle_fuzz;
+ ktime_t toggle_interval = ms_to_ktime(nocbs_toggle);
+
+ VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started");
+ while (!rcu_inkernel_boot_has_ended())
+ schedule_timeout_interruptible(HZ / 10);
+ for_each_online_cpu(cpu)
+ maxcpu = cpu;
+ WARN_ON(maxcpu < 0); // No online CPU was found.
+ if (toggle_interval > ULONG_MAX)
+ toggle_fuzz = ULONG_MAX >> 3;
+ else
+ toggle_fuzz = toggle_interval >> 3;
+ if (toggle_fuzz <= 0)
+ toggle_fuzz = NSEC_PER_USEC; // Avoid a zero modulus below.
+ do {
+ r = torture_random(&rand);
+ cpu = (r >> 4) % (maxcpu + 1); // Bit 0 picks the direction, so take the CPU from higher bits.
+ if (r & 0x1) {
+ rcu_nocb_cpu_offload(cpu);
+ atomic_long_inc(&n_nocb_offload);
+ } else {
+ rcu_nocb_cpu_deoffload(cpu);
+ atomic_long_inc(&n_nocb_deoffload);
+ }
+ toggle_delay = torture_random(&rand) % toggle_fuzz + toggle_interval;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_hrtimeout(&toggle_delay, HRTIMER_MODE_REL);
+ if (stutter_wait("rcu_nocb_toggle"))
+ sched_set_normal(current, oldnice); // NOTE(review): presumably stutter_wait() can change our priority -- confirm.
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_nocb_toggle");
+ return 0;
+}
+
+/*
* Print torture statistics. Caller must ensure that there is only
* one call to this function at a given time!!! This is normally
* accomplished by relying on the module system to only have one copy
@@ -1539,8 +1765,9 @@ rcu_torture_stats_print(void)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ",
atomic_read(&n_rcu_torture_mberror),
+ atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries),
n_rcu_torture_barrier_error,
n_rcu_torture_boost_ktrerror,
n_rcu_torture_boost_rterror);
@@ -1553,16 +1780,20 @@ rcu_torture_stats_print(void)
data_race(n_barrier_successes),
data_race(n_barrier_attempts),
data_race(n_rcu_torture_barrier_error));
- pr_cont("read-exits: %ld\n", data_race(n_read_exits));
+ pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic.
+ pr_cont("nocb-toggles: %ld:%ld\n",
+ atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload));
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) ||
+ atomic_read(&n_rcu_torture_mbchk_fail) ||
n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
+ WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail));
WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
@@ -1647,7 +1878,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"stall_cpu_block=%d "
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d "
- "read_exit_delay=%d read_exit_burst=%d\n",
+ "read_exit_delay=%d read_exit_burst=%d "
+ "nocbs_nthreads=%d nocbs_toggle=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -1657,7 +1889,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stall_cpu_block,
n_barrier_cbs,
onoff_interval, onoff_holdoff,
- read_exit_delay, read_exit_burst);
+ read_exit_delay, read_exit_burst,
+ nocbs_nthreads, nocbs_toggle);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -2392,7 +2625,7 @@ static int rcu_torture_read_exit_child(void *trsp_in)
// Minimize time between reading and exiting.
while (!kthread_should_stop())
schedule_timeout_uninterruptible(1);
- (void)rcu_torture_one_read(trsp);
+ (void)rcu_torture_one_read(trsp, -1);
return 0;
}
@@ -2500,6 +2733,13 @@ rcu_torture_cleanup(void)
torture_stop_kthread(rcu_torture_stall, stall_task);
torture_stop_kthread(rcu_torture_writer, writer_task);
+ if (nocb_tasks) {
+ for (i = 0; i < nrealnocbers; i++)
+ torture_stop_kthread(rcu_nocb_toggle, nocb_tasks[i]);
+ kfree(nocb_tasks);
+ nocb_tasks = NULL;
+ }
+
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
torture_stop_kthread(rcu_torture_reader,
@@ -2507,6 +2747,8 @@ rcu_torture_cleanup(void)
kfree(reader_tasks);
reader_tasks = NULL;
}
+ kfree(rcu_torture_reader_mbchk);
+ rcu_torture_reader_mbchk = NULL;
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
@@ -2604,6 +2846,7 @@ static void rcu_test_debug_objects(void)
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
struct rcu_head rh1;
struct rcu_head rh2;
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
@@ -2616,6 +2859,10 @@ static void rcu_test_debug_objects(void)
local_irq_disable(); /* Make it harder to start a new grace period. */
call_rcu(&rh2, rcu_torture_leak_cb);
call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ if (rhp) {
+ call_rcu(rhp, rcu_torture_leak_cb);
+ call_rcu(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ }
local_irq_enable();
rcu_read_unlock();
preempt_enable();
@@ -2710,6 +2957,8 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_alloc_fail, 0);
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
+ atomic_set(&n_rcu_torture_mbchk_fail, 0);
+ atomic_set(&n_rcu_torture_mbchk_tries, 0);
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
@@ -2729,6 +2978,7 @@ rcu_torture_init(void)
/* Start up the kthreads. */
+ rcu_torture_write_types();
firsterr = torture_create_kthread(rcu_torture_writer, NULL,
writer_task);
if (firsterr)
@@ -2751,17 +3001,40 @@ rcu_torture_init(void)
}
reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
- if (reader_tasks == NULL) {
+ rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk),
+ GFP_KERNEL);
+ if (!reader_tasks || !rcu_torture_reader_mbchk) {
VERBOSE_TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nrealreaders; i++) {
+ rcu_torture_reader_mbchk[i].rtc_chkrdr = -1;
firsterr = torture_create_kthread(rcu_torture_reader, (void *)i,
reader_tasks[i]);
if (firsterr)
goto unwind;
}
+ nrealnocbers = nocbs_nthreads;
+ if (WARN_ON(nrealnocbers < 0))
+ nrealnocbers = 1;
+ if (WARN_ON(nocbs_toggle < 0))
+ nocbs_toggle = HZ;
+ if (nrealnocbers > 0) {
+ nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL);
+ if (nocb_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ } else {
+ nocb_tasks = NULL;
+ }
+ for (i = 0; i < nrealnocbers; i++) {
+ firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
if (stat_interval > 0) {
firsterr = torture_create_kthread(rcu_torture_stats, NULL,
stats_task);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 23ff36a66f979..02dd9767b5591 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -46,6 +46,18 @@
#define VERBOSE_SCALEOUT(s, x...) \
do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)
+static atomic_t verbose_batch_ctr; // Incremented by VERBOSE_SCALEOUT_BATCH() to select every verbose_batched-th message.
+
+#define VERBOSE_SCALEOUT_BATCH(s, x...) \
+do { \
+ if (verbose && \
+ (verbose_batched <= 0 || \
+ !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \
+ schedule_timeout_uninterruptible(1); \
+ pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \
+ } \
+} while (0) // Prints only if verbose, and then only when batching is off or the counter is a multiple of verbose_batched.
+
#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
@@ -57,6 +69,7 @@ module_param(scale_type, charp, 0444);
MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");
// Wait until there are multiple CPUs before starting test.
torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
@@ -368,14 +381,14 @@ ref_scale_reader(void *arg)
u64 start;
s64 duration;
- VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me);
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
set_user_nice(current, MAX_NICE);
atomic_inc(&n_init);
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
repeat:
- VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
// Wait for signal that this reader can start.
wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
@@ -392,7 +405,7 @@ repeat:
while (atomic_read_acquire(&n_started))
cpu_relax();
- VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx);
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d started", me, exp_idx);
// To reduce noise, do an initial cache-warming invocation, check
@@ -421,8 +434,8 @@ repeat:
if (atomic_dec_and_test(&nreaders_exp))
wake_up(&main_wq);
- VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
- me, exp_idx, atomic_read(&nreaders_exp));
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
+ me, exp_idx, atomic_read(&nreaders_exp));
if (!torture_must_stop())
goto repeat;
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 6208c1dae5c95..26344dc6483b0 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp)
ssp->srcu_gp_running = false;
ssp->srcu_gp_waiting = false;
ssp->srcu_idx = 0;
+ ssp->srcu_idx_max = 0;
INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
INIT_LIST_HEAD(&ssp->srcu_work.entry);
return 0;
@@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
WARN_ON(ssp->srcu_gp_waiting);
WARN_ON(ssp->srcu_cb_head);
WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
+ WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max);
+ WARN_ON(ssp->srcu_idx & 0x1);
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
@@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head))
+ if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
return; /* Already running or nothing to do. */
/* Remove recently arrived callbacks and wait for readers. */
@@ -124,11 +127,12 @@ void srcu_drive_gp(struct work_struct *wp)
ssp->srcu_cb_head = NULL;
ssp->srcu_cb_tail = &ssp->srcu_cb_head;
local_irq_enable();
- idx = ssp->srcu_idx;
- WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx);
+ idx = (ssp->srcu_idx & 0x2) / 2;
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
/* Invoke the callbacks we removed above. */
while (lh) {
@@ -146,11 +150,27 @@ void srcu_drive_gp(struct work_struct *wp)
* straighten that out.
*/
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (READ_ONCE(ssp->srcu_cb_head))
+ if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
+static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
+{
+ unsigned short cookie;
+
+ cookie = get_state_synchronize_srcu(ssp); // End-of-grace-period value of ->srcu_idx that we need.
+ if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ return; // ->srcu_idx_max already covers this cookie.
+ WRITE_ONCE(ssp->srcu_idx_max, cookie); // srcu_drive_gp() compares ->srcu_idx against this.
+ if (!READ_ONCE(ssp->srcu_gp_running)) {
+ if (likely(srcu_init_done))
+ schedule_work(&ssp->srcu_work);
+ else if (list_empty(&ssp->srcu_work.entry))
+ list_add(&ssp->srcu_work.entry, &srcu_boot_list); // srcu_init_done not yet set: defer via srcu_boot_list.
+ }
+}
+
/*
* Enqueue an SRCU callback on the specified srcu_struct structure,
* initiating grace-period processing if it is not already running.
@@ -166,12 +186,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
- if (!READ_ONCE(ssp->srcu_gp_running)) {
- if (likely(srcu_init_done))
- schedule_work(&ssp->srcu_work);
- else if (list_empty(&ssp->srcu_work.entry))
- list_add(&ssp->srcu_work.entry, &srcu_boot_list);
- }
+ srcu_gp_start_if_needed(ssp);
}
EXPORT_SYMBOL_GPL(call_srcu);
@@ -190,6 +205,48 @@ void synchronize_srcu(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+/*
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ *
+ * The cookie is the current ->srcu_idx plus three with the low-order
+ * bit cleared, masked with USHRT_MAX. This function only reads
+ * ->srcu_idx; it does not itself start a grace period.
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret;
+
+ barrier();
+ ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
+ barrier();
+ return ret & USHRT_MAX;
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/*
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ *
+ * The difference between this and get_state_synchronize_srcu() is that
+ * this function ensures that the poll_state_synchronize_srcu() will
+ * eventually return the value true.
+ *
+ * Returns the cookie obtained from get_state_synchronize_srcu().
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret = get_state_synchronize_srcu(ssp);
+
+ srcu_gp_start_if_needed(ssp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/*
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ *
+ * Returns true if ->srcu_idx has reached or passed @cookie according
+ * to USHORT_CMP_GE(), and false otherwise.
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie);
+
+ barrier();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
/* Lockdep diagnostics. */
void __init rcu_scheduler_starting(void)
{
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0f23d20d485a1..e26547b34ad33 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -808,6 +808,46 @@ static void srcu_leak_callback(struct rcu_head *rhp)
}
/*
+ * Start an SRCU grace period, and also queue the callback if non-NULL.
+ *
+ * @ssp: srcu_struct on which to start the grace period.
+ * @rhp: callback to enqueue on this CPU's ->srcu_cblist, or NULL for none.
+ * @do_norm: if false, also record and funnel an expedited request.
+ *
+ * Returns the rcu_seq_snap() value of ->srcu_gp_seq that was taken while
+ * holding the srcu_data structure's lock.
+ */
+static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
+ struct rcu_head *rhp, bool do_norm)
+{
+ unsigned long flags;
+ int idx;
+ bool needexp = false;
+ bool needgp = false;
+ unsigned long s;
+ struct srcu_data *sdp;
+
+ check_init_srcu_struct(ssp);
+ idx = srcu_read_lock(ssp);
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_rcu_node(sdp, flags);
+ if (rhp)
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&ssp->srcu_gp_seq));
+ s = rcu_seq_snap(&ssp->srcu_gp_seq);
+ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
+ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
+ sdp->srcu_gp_seq_needed = s;
+ needgp = true; // This CPU had not yet requested grace period s.
+ }
+ if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
+ sdp->srcu_gp_seq_needed_exp = s;
+ needexp = true; // Likewise for the expedited request.
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ if (needgp)
+ srcu_funnel_gp_start(ssp, sdp, s, do_norm);
+ else if (needexp)
+ srcu_funnel_exp_start(ssp, sdp->mynode, s);
+ srcu_read_unlock(ssp, idx);
+ return s;
+}
+
+/*
* Enqueue an SRCU callback on the srcu_data structure associated with
* the current CPU and the specified srcu_struct structure, initiating
* grace-period processing if it is not already running.
@@ -838,14 +878,6 @@ static void srcu_leak_callback(struct rcu_head *rhp)
static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func, bool do_norm)
{
- unsigned long flags;
- int idx;
- bool needexp = false;
- bool needgp = false;
- unsigned long s;
- struct srcu_data *sdp;
-
- check_init_srcu_struct(ssp);
if (debug_rcu_head_queue(rhp)) {
/* Probable double call_srcu(), so leak the callback. */
WRITE_ONCE(rhp->func, srcu_leak_callback);
@@ -853,28 +885,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
return;
}
rhp->func = func;
- idx = srcu_read_lock(ssp);
- sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_rcu_node(sdp, flags);
- rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
- rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- s = rcu_seq_snap(&ssp->srcu_gp_seq);
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
- if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
- sdp->srcu_gp_seq_needed = s;
- needgp = true;
- }
- if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
- sdp->srcu_gp_seq_needed_exp = s;
- needexp = true;
- }
- spin_unlock_irqrestore_rcu_node(sdp, flags);
- if (needgp)
- srcu_funnel_gp_start(ssp, sdp, s, do_norm);
- else if (needexp)
- srcu_funnel_exp_start(ssp, sdp->mynode, s);
- srcu_read_unlock(ssp, idx);
+ (void)srcu_gp_start_if_needed(ssp, rhp, do_norm);
}
/**
@@ -1003,6 +1014,77 @@ void synchronize_srcu(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+/**
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. It is the caller's responsibility
+ * to make sure that grace period happens, for example, by invoking
+ * call_srcu() after return from get_state_synchronize_srcu().
+ *
+ * Return: the rcu_seq_snap() value of @ssp's ->srcu_gp_seq, read after
+ * a full memory barrier.
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ // Any prior manipulation of SRCU-protected data must happen
+ // before the load from ->srcu_gp_seq.
+ smp_mb();
+ return rcu_seq_snap(&ssp->srcu_gp_seq);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/**
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(),
+ * this function also ensures that any needed SRCU grace period will be
+ * started. This convenience does come at a cost in terms of CPU overhead.
+ *
+ * Return: the value returned by srcu_gp_start_if_needed() when invoked
+ * with no callback and with do_norm set to true.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ return srcu_gp_start_if_needed(ssp, NULL, true);
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/**
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ * @ssp: srcu_struct to provide cookie for.
+ * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
+ *
+ * This function takes the cookie that was returned from either
+ * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
+ * returns @true if an SRCU grace period elapsed since the time that the
+ * cookie was created.
+ *
+ * Because cookies are finite in size, wrapping/overflow is possible.
+ * This is more pronounced on 32-bit systems where cookies are 32 bits,
+ * where in theory wrapping could happen in about 14 hours assuming
+ * 25-microsecond expedited SRCU grace periods. However, a more likely
+ * overflow lower bound is on the order of 24 days in the case of
+ * one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit
+ * system requires geologic timespans, as in more than seven million years
+ * even for expedited SRCU grace periods.
+ *
+ * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems
+ * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. This uses
+ * a 16-bit cookie, which rcutorture routinely wraps in a matter of a
+ * few minutes. If this proves to be a problem, this counter will be
+ * expanded to the same size as for Tree SRCU.
+ *
+ * Return: @true if rcu_seq_done() reports that ->srcu_gp_seq has reached
+ * @cookie, in which case a full memory barrier is executed before
+ * returning; otherwise @false.
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie))
+ return false;
+ // Ensure that the end of the SRCU grace period happens before
+ // any subsequent code that the caller might execute.
+ smp_mb(); // ^^^
+ return true;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
/*
* Callback function for srcu_barrier() use.
*/
@@ -1160,6 +1242,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
static void srcu_invoke_callbacks(struct work_struct *work)
{
+ long len;
bool more;
struct rcu_cblist ready_cbs;
struct rcu_head *rhp;
@@ -1182,6 +1265,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
/* We are on the job! Extract and invoke ready callbacks. */
sdp->srcu_cblist_invoking = true;
rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
+ len = ready_cbs.len;
spin_unlock_irq_rcu_node(sdp);
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
@@ -1190,13 +1274,14 @@ static void srcu_invoke_callbacks(struct work_struct *work)
rhp->func(rhp);
local_bh_enable();
}
+ WARN_ON_ONCE(ready_cbs.len);
/*
* Update counts, accelerate new callbacks, and if needed,
* schedule another round of callback invocation.
*/
spin_lock_irq_rcu_node(sdp);
- rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
+ rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
rcu_seq_snap(&ssp->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 36607551f9665..af7c19439f4ec 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1224,6 +1224,82 @@ void show_rcu_tasks_gp_kthreads(void)
}
#endif /* #ifndef CONFIG_TINY_RCU */
+#ifdef CONFIG_PROVE_RCU
+struct rcu_tasks_test_desc { // Describes one boot-time RCU-tasks self-test.
+ struct rcu_head rh; // Handed to the call_rcu_tasks*() function under test.
+ const char *name; // Name of the API under test, used in messages.
+ bool notrun; // True if the test is skipped or its callback has been invoked; false while still pending.
+};
+
+static struct rcu_tasks_test_desc tests[] = { // Indexed by rcu_tasks_initiate_self_tests(): 0, 1 and 2 in the order below.
+ {
+ .name = "call_rcu_tasks()",
+ /* If not defined, the test is skipped. */
+ .notrun = !IS_ENABLED(CONFIG_TASKS_RCU),
+ },
+ {
+ .name = "call_rcu_tasks_rude()",
+ /* If not defined, the test is skipped. */
+ .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
+ },
+ {
+ .name = "call_rcu_tasks_trace()",
+ /* If not defined, the test is skipped. */
+ .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
+ }
+};
+
+static void test_rcu_tasks_callback(struct rcu_head *rhp) // Callback shared by all of the self-tests.
+{
+ struct rcu_tasks_test_desc *rttd =
+ container_of(rhp, struct rcu_tasks_test_desc, rh); // Recover the enclosing test descriptor.
+
+ pr_info("Callback from %s invoked.\n", rttd->name);
+
+ rttd->notrun = true; // Mark as completed so that rcu_tasks_verify_self_tests() does not complain.
+}
+
+static void rcu_tasks_initiate_self_tests(void) // Called from rcu_init_tasks_generic().
+{
+ pr_info("Running RCU-tasks wait API self tests\n");
+#ifdef CONFIG_TASKS_RCU
+ synchronize_rcu_tasks(); // Exercise the synchronous wait first.
+ call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); // Invocation is checked later by rcu_tasks_verify_self_tests().
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ synchronize_rcu_tasks_rude();
+ call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ synchronize_rcu_tasks_trace();
+ call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
+#endif
+}
+
+static int rcu_tasks_verify_self_tests(void) // Returns 0 if every non-skipped callback has run, otherwise -1.
+{
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ if (!tests[i].notrun) { // Callback not yet invoked: still hanging.
+ pr_err("%s has been failed.\n", tests[i].name);
+ ret = -1;
+ }
+ }
+
+ if (ret)
+ WARN_ON(1);
+
+ return ret;
+}
+late_initcall(rcu_tasks_verify_self_tests); // Registered as a late initcall, so the check runs at that stage.
+#else /* #ifdef CONFIG_PROVE_RCU */
+static void rcu_tasks_initiate_self_tests(void) { }
+#endif /* #else #ifdef CONFIG_PROVE_RCU */
+
void __init rcu_init_tasks_generic(void)
{
#ifdef CONFIG_TASKS_RCU
@@ -1237,6 +1313,9 @@ void __init rcu_init_tasks_generic(void)
#ifdef CONFIG_TASKS_TRACE_RCU
rcu_spawn_tasks_trace_kthread();
#endif
+
+ // Run the self-tests.
+ rcu_tasks_initiate_self_tests();
}
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 40e5e3dd253e0..da6f5213fb74c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -83,6 +83,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+#ifdef CONFIG_RCU_NOCB_CPU
+ .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
+#endif
};
static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
@@ -100,8 +103,10 @@ static struct rcu_state rcu_state = {
static bool dump_tree;
module_param(dump_tree, bool, 0444);
/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
-static bool use_softirq = true;
+static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
+#ifndef CONFIG_PREEMPT_RT
module_param(use_softirq, bool, 0444);
+#endif
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
@@ -644,7 +649,6 @@ static noinstr void rcu_eqs_enter(bool user)
trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rdp = this_cpu_ptr(&rcu_data);
- do_nocb_deferred_wakeup(rdp);
rcu_prepare_for_idle();
rcu_preempt_deferred_qs(current);
@@ -678,6 +682,50 @@ void rcu_idle_enter(void)
EXPORT_SYMBOL_GPL(rcu_idle_enter);
#ifdef CONFIG_NO_HZ_FULL
+
+#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
+/*
+ * An intentionally empty irq_work handler. Queueing it will trigger a
+ * reschedule on IRQ tail once IRQs get re-enabled on userspace/guest resume.
+ */
+static void late_wakeup_func(struct irq_work *work)
+{
+}
+
+static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
+ IRQ_WORK_INIT(late_wakeup_func); // Per-CPU irq_work queued by rcu_irq_work_resched().
+
+/*
+ * This function does something only if either:
+ *
+ * 1) the task is about to enter guest mode and $ARCH doesn't support KVM generic work, or
+ * 2) the task is about to enter user mode and $ARCH doesn't support generic entry.
+ *
+ * In these cases the late RCU wakeups aren't supported in the resched loops, so our
+ * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
+ * get re-enabled.
+ */
+noinstr static void rcu_irq_work_resched(void)
+{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
+ return; // Not PF_VCPU and generic entry is available: nothing to do here.
+
+ if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
+ return; // PF_VCPU and KVM generic work is available: nothing to do here.
+
+ instrumentation_begin();
+ if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
+ irq_work_queue(this_cpu_ptr(&late_wakeup_work)); // Handler is empty; see the comment above for why queueing it suffices.
+ }
+ instrumentation_end();
+}
+
+#else
+static inline void rcu_irq_work_resched(void) { }
+#endif
+
/**
* rcu_user_enter - inform RCU that we are resuming userspace.
*
@@ -692,8 +740,16 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
noinstr void rcu_user_enter(void)
{
lockdep_assert_irqs_disabled();
+
+ /*
+ * Outside of the generic entry implementation, we may be past the last
+ * rescheduling opportunity in the entry code. Trigger a self-IPI
+ * that will fire and reschedule once we resume in user/guest mode.
+ */
+ rcu_irq_work_resched();
rcu_eqs_enter(true);
}
+
#endif /* CONFIG_NO_HZ_FULL */
/**
@@ -1495,6 +1551,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
+
/*
* Callbacks are often registered with incomplete grace-period
* information. Something about the fact that getting exact
@@ -1515,6 +1573,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
else
trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
+
return ret;
}
@@ -1765,7 +1825,7 @@ static bool rcu_gp_init(void)
* go offline later. Please also refer to "Hotplug CPU" section
* of RCU's Requirements documentation.
*/
- rcu_state.gp_state = RCU_GP_ONOFF;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
rcu_for_each_leaf_node(rnp) {
smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
firstseq = READ_ONCE(rnp->ofl_seq);
@@ -1831,7 +1891,7 @@ static bool rcu_gp_init(void)
* The grace period cannot complete until the initialization
* process finishes, because this kthread handles both.
*/
- rcu_state.gp_state = RCU_GP_INIT;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
rcu_for_each_node_breadth_first(rnp) {
rcu_gp_slow(gp_init_delay);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -1930,17 +1990,22 @@ static void rcu_gp_fqs_loop(void)
ret = 0;
for (;;) {
if (!ret) {
- rcu_state.jiffies_force_qs = jiffies + j;
+ WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
+ /*
+ * Order the jiffies_force_qs store before the RCU_GP_WAIT_FQS
+ * state update; required for stall checks.
+ */
+ smp_wmb();
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
jiffies + (j ? 3 * j : 2));
}
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
- rcu_state.gp_state = RCU_GP_WAIT_FQS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
ret = swait_event_idle_timeout_exclusive(
rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
rcu_gp_torture_wait();
- rcu_state.gp_state = RCU_GP_DOING_FQS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
if (!READ_ONCE(rnp->qsmask) &&
@@ -2054,7 +2119,7 @@ static void rcu_gp_cleanup(void)
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
rcu_seq_end(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
- rcu_state.gp_state = RCU_GP_IDLE;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
/* Check for GP requests since above loop. */
rdp = this_cpu_ptr(&rcu_data);
if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
@@ -2093,12 +2158,12 @@ static int __noreturn rcu_gp_kthread(void *unused)
for (;;) {
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwait"));
- rcu_state.gp_state = RCU_GP_WAIT_GPS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
swait_event_idle_exclusive(rcu_state.gp_wq,
READ_ONCE(rcu_state.gp_flags) &
RCU_GP_FLAG_INIT);
rcu_gp_torture_wait();
- rcu_state.gp_state = RCU_GP_DONE_GPS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
/* Locking provides needed memory barrier. */
if (rcu_gp_init())
break;
@@ -2113,9 +2178,9 @@ static int __noreturn rcu_gp_kthread(void *unused)
rcu_gp_fqs_loop();
/* Handle grace-period end. */
- rcu_state.gp_state = RCU_GP_CLEANUP;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
rcu_gp_cleanup();
- rcu_state.gp_state = RCU_GP_CLEANED;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
}
}
@@ -2430,11 +2495,12 @@ int rcutree_dead_cpu(unsigned int cpu)
static void rcu_do_batch(struct rcu_data *rdp)
{
int div;
+ bool __maybe_unused empty;
unsigned long flags;
const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
- long bl, count;
+ long bl, count = 0;
long pending, tlimit = 0;
/* If no callbacks are ready, just return. */
@@ -2471,14 +2537,18 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
if (offloaded)
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
+
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
rcu_callback_t f;
+ count++;
debug_rcu_head_unqueue(rhp);
rcu_lock_acquire(&rcu_callback_map);
@@ -2492,21 +2562,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
/*
* Stop only if limit reached and CPU has something to do.
- * Note: The rcl structure counts down from zero.
*/
- if (-rcl.len >= bl && !offloaded &&
+ if (count >= bl && !offloaded &&
(need_resched() ||
(!is_idle_task(current) && !rcu_is_callbacks_kthread())))
break;
if (unlikely(tlimit)) {
/* only call local_clock() every 32 callbacks */
- if (likely((-rcl.len & 31) || local_clock() < tlimit))
+ if (likely((count & 31) || local_clock() < tlimit))
continue;
/* Exceeded the time limit, so leave. */
break;
}
- if (offloaded) {
- WARN_ON_ONCE(in_serving_softirq());
+ if (!in_serving_softirq()) {
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
@@ -2517,15 +2585,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
local_irq_save(flags);
rcu_nocb_lock(rdp);
- count = -rcl.len;
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
- smp_mb(); /* List handling before counting for rcu_barrier(). */
- rcu_segcblist_insert_count(&rdp->cblist, &rcl);
+ rcu_segcblist_add_len(&rdp->cblist, -count);
/* Reinstate batch limit if we have worked down the excess. */
count = rcu_segcblist_n_cbs(&rdp->cblist);
@@ -2543,9 +2609,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
* The following usually indicates a double call_rcu(). To track
* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
*/
- WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
+ empty = rcu_segcblist_empty(&rdp->cblist);
+ WARN_ON_ONCE(count == 0 && !empty);
WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- count != 0 && rcu_segcblist_empty(&rdp->cblist));
+ count != 0 && empty);
+ WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
+ WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
rcu_nocb_unlock_irqrestore(rdp, flags);
@@ -2566,6 +2635,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
void rcu_sched_clock_irq(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
+ lockdep_assert_irqs_disabled();
raw_cpu_inc(rcu_data.ticks_this_gp);
/* The load-acquire pairs with the store-release setting to true. */
if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
@@ -2579,6 +2649,7 @@ void rcu_sched_clock_irq(int user)
rcu_flavor_sched_clock_irq(user);
if (rcu_pending(user))
invoke_rcu_core();
+ lockdep_assert_irqs_disabled();
trace_rcu_utilization(TPS("End scheduler-tick"));
}
@@ -2688,7 +2759,7 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2708,17 +2779,17 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
- local_irq_save(flags);
+ rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
+ rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
- local_irq_restore(flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
likely(READ_ONCE(rcu_scheduler_fully_active)))
rcu_do_batch(rdp);
@@ -2941,6 +3012,7 @@ static void check_cb_ovld(struct rcu_data *rdp)
static void
__call_rcu(struct rcu_head *head, rcu_callback_t func)
{
+ static atomic_t doublefrees;
unsigned long flags;
struct rcu_data *rdp;
bool was_alldone;
@@ -2954,8 +3026,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* Use rcu:rcu_callback trace event to find the previous
* time callback was passed to __call_rcu().
*/
- WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
- head, head->func);
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
WRITE_ONCE(head->func, rcu_leak_callback);
return;
}
@@ -2989,6 +3063,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
trace_rcu_callback(rcu_state.name, head,
rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
+
/* Go handle any RCU core processing required. */
if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
@@ -3498,6 +3574,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
goto unlock_return;
}
+ kasan_record_aux_stack(ptr);
success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
if (!success) {
run_page_cache_worker(krcp);
@@ -3747,6 +3824,8 @@ static int rcu_pending(int user)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ lockdep_assert_irqs_disabled();
+
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rdp);
@@ -4001,12 +4080,18 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rcu_state.n_force_qs;
rdp->blimit = blimit;
- if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
- !rcu_segcblist_is_offloaded(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ /*
+ * Lock in case the CB/GP kthreads are still around handling
+ * old callbacks (longer term we should flush all callbacks
+ * before completing CPU offline)
+ */
+ rcu_nocb_lock(rdp);
+ if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */
+ rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
+ rcu_nocb_unlock(rdp);
/*
* Add CPU to leaf rcu_node pending-online bitmask. Any needed
@@ -4159,6 +4244,9 @@ void rcu_report_dead(unsigned int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+ // Do any dangling deferred wakeups.
+ do_nocb_deferred_wakeup(rdp);
+
/* QS for any half-done expedited grace period. */
preempt_disable();
rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7708ed161f4a2..71821d59d95c5 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -201,6 +201,7 @@ struct rcu_data {
/* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
+ struct swait_queue_head nocb_state_wq; /* For offloading state changes */
struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
atomic_t nocb_lock_contended; /* Contention experienced. */
@@ -256,6 +257,7 @@ struct rcu_data {
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
+#define RCU_NOCB_WAKE_OFF -1
#define RCU_NOCB_WAKE_NOT 0
#define RCU_NOCB_WAKE 1
#define RCU_NOCB_WAKE_FORCE 2
@@ -433,7 +435,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
static void __init rcu_spawn_nocb_kthreads(void);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8760b6ead770a..6c6ff06d4ae65 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -545,7 +545,7 @@ static void synchronize_rcu_expedited_wait(void)
data_race(rnp_root->expmask),
".T"[!!data_race(rnp_root->exp_tasks)]);
if (ndetected) {
- pr_err("blocking rcu_node structures:");
+ pr_err("blocking rcu_node structures (internal RCU debug):");
rcu_for_each_node_breadth_first(rnp) {
if (rnp == rnp_root)
continue; /* printed unconditionally */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7e291ce0a1d6f..2d603771c7dce 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -682,6 +682,7 @@ static void rcu_flavor_sched_clock_irq(int user)
{
struct task_struct *t = current;
+ lockdep_assert_irqs_disabled();
if (user || rcu_is_cpu_rrupt_from_idle()) {
rcu_note_voluntary_context_switch(current);
}
@@ -1631,8 +1632,8 @@ bool rcu_is_nocb_cpu(int cpu)
* Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
* and this function releases it.
*/
-static void wake_nocb_gp(struct rcu_data *rdp, bool force,
- unsigned long flags)
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
+ unsigned long flags)
__releases(rdp->nocb_lock)
{
bool needwake = false;
@@ -1643,7 +1644,7 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force,
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("AlreadyAwake"));
rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
+ return false;
}
del_timer(&rdp->nocb_timer);
rcu_nocb_unlock_irqrestore(rdp, flags);
@@ -1656,6 +1657,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force,
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
if (needwake)
wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ return needwake;
}
/*
@@ -1665,6 +1668,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force,
static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
const char *reason)
{
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
+ return;
if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
mod_timer(&rdp->nocb_timer, jiffies + 1);
if (rdp->nocb_defer_wakeup < waketype)
@@ -1929,6 +1934,52 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
}
/*
+ * Check if we ignore this rdp.
+ *
+ * We check that without holding the nocb lock but
+ * we make sure not to miss a freshly offloaded rdp
+ * with the current ordering:
+ *
+ * rdp_offload_toggle() nocb_gp_enabled_cb()
+ * ------------------------- ----------------------------
+ * WRITE flags LOCK nocb_gp_lock
+ * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
+ * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
+ * UNLOCK nocb_gp_lock READ flags
+ */
+static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
+
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ }
+ return true;
+ }
+
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it gets re-offloaded, if ever.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ return false;
+}
+
+
+/*
* No-CBs GP kthreads come here to wait for additional callbacks to show up
* or for grace periods to end.
*/
@@ -1956,8 +2007,18 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
*/
WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ bool needwake_state = false;
+
+ if (!nocb_gp_enabled_cb(rdp))
+ continue;
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
rcu_nocb_lock_irqsave(rdp, flags);
+ if (!nocb_gp_update_state(rdp, &needwake_state)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ continue;
+ }
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
if (bypass_ncbs &&
(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
@@ -1967,6 +2028,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
continue; /* No callbacks here, try next. */
}
if (bypass_ncbs) {
@@ -2018,6 +2081,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (needwake_gp)
rcu_gp_kthread_wake();
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
}
my_rdp->nocb_gp_bypass = bypass;
@@ -2081,14 +2146,27 @@ static int rcu_nocb_gp_kthread(void *arg)
return 0;
}
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
/*
* Invoke any ready callbacks from the corresponding no-CBs CPU,
* then, if there are no more, wait for more to appear.
*/
static void nocb_cb_wait(struct rcu_data *rdp)
{
+ struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long cur_gp_seq;
unsigned long flags;
+ bool needwake_state = false;
bool needwake_gp = false;
struct rcu_node *rnp = rdp->mynode;
@@ -2100,32 +2178,55 @@ static void nocb_cb_wait(struct rcu_data *rdp)
local_bh_enable();
lockdep_assert_irqs_enabled();
rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
}
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
- return;
- }
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
WRITE_ONCE(rdp->nocb_cb_sleep, true);
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+ if (rcu_segcblist_ready_cbs(cblist))
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ } else {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We won't touch the callbacks and will keep sleeping until we
+ * get re-offloaded, if ever.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+
+ if (rdp->nocb_cb_sleep)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
rcu_nocb_unlock_irqrestore(rdp, flags);
if (needwake_gp)
rcu_gp_kthread_wake();
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- !READ_ONCE(rdp->nocb_cb_sleep));
- if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
- /* ^^^ Ensure CB invocation follows _sleep test. */
- return;
- }
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
}
/*
@@ -2148,24 +2249,27 @@ static int rcu_nocb_cb_kthread(void *arg)
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
- return READ_ONCE(rdp->nocb_defer_wakeup);
+ return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
{
unsigned long flags;
int ndw;
+ int ret;
rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_nocb_need_deferred_wakeup(rdp)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
+ return false;
}
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
}
/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
@@ -2181,12 +2285,208 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
* This means we do an inexact common-case check. Note that if
* we miss, ->nocb_timer will eventually clean things up.
*/
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
if (rcu_nocb_need_deferred_wakeup(rdp))
- do_nocb_deferred_wakeup_common(rdp);
+ return do_nocb_deferred_wakeup_common(rdp);
+ return false;
}
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+
+static int rdp_offload_toggle(struct rcu_data *rdp,
+ bool offload, unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+
+ rcu_segcblist_offload(cblist, offload);
+
+ if (rdp->nocb_cb_sleep)
+ rdp->nocb_cb_sleep = false;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ /*
+ * Ignore former value of nocb_cb_sleep and force wake up as it could
+ * have been spuriously set to false already.
+ */
+ swake_up_one(&rdp->nocb_cb_wq);
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ return 0;
+}
+
+static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * If there is still pending offloaded work, the offline
+ * CPU won't be of much help handling it.
+ */
+ if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ return -EBUSY;
+ }
+
+ ret = rdp_offload_toggle(rdp, false, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
+ SEGCBLIST_KTHREAD_GP));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /* Make sure nocb timer won't stay around */
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ del_timer_sync(&rdp->nocb_timer);
+
+ /*
+ * Flush bypass. While IRQs are disabled and once we set
+ * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be
+ * enqueued on bypass.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_nocb_flush_bypass(rdp, NULL, jiffies);
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
+ /*
+ * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
+ * rcu_nocb_unlock_irqrestore() anymore. Theoretically we
+ * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs
+ * disabled now, but let's be paranoid.
+ */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ return ret;
+}
+
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ return __rcu_nocb_rdp_deoffload(rdp);
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ if (rdp == rdp->nocb_gp_rdp) {
+ pr_info("Can't deoffload an rdp GP leader (yet)\n");
+ return -EINVAL;
+ }
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ if (cpu_online(cpu))
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ else
+ ret = __rcu_nocb_rdp_deoffload(rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+static int __rcu_nocb_rdp_offload(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ /*
+ * For now we only support re-offload, ie: the rdp must have been
+ * offloaded on boot first.
+ */
+ if (!rdp->nocb_gp_rdp)
+ return -EINVAL;
+
+ pr_info("Offloading %d\n", rdp->cpu);
+ /*
+ * Can't use rcu_nocb_lock_irqsave() while we are in
+ * SEGCBLIST_SOFTIRQ_ONLY mode.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ /* Re-enable nocb timer */
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ /*
+ * We didn't take the nocb lock while working on the
+ * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
+ * Every modification previously made to
+ * rdp->cblist must be visible remotely to the nocb kthreads
+ * upon wakeup after reading the cblist flags.
+ *
+ * The layout against nocb_lock enforces that ordering:
+ *
+ * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
+ * ------------------------- ----------------------------
+ * WRITE callbacks rcu_nocb_lock()
+ * rcu_nocb_lock() READ flags
+ * WRITE flags READ callbacks
+ * rcu_nocb_unlock() rcu_nocb_unlock()
+ */
+ ret = rdp_offload_toggle(rdp, true, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+ return ret;
+}
+
+static long rcu_nocb_rdp_offload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ return __rcu_nocb_rdp_offload(rdp);
+}
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ if (cpu_online(cpu))
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ else
+ ret = __rcu_nocb_rdp_offload(rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+
void __init rcu_init_nohz(void)
{
int cpu;
@@ -2229,7 +2529,9 @@ void __init rcu_init_nohz(void)
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist, true);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
}
rcu_organize_nocb_kthreads();
}
@@ -2239,6 +2541,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
init_swait_queue_head(&rdp->nocb_cb_wq);
init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
raw_spin_lock_init(&rdp->nocb_lock);
raw_spin_lock_init(&rdp->nocb_bypass_lock);
raw_spin_lock_init(&rdp->nocb_gp_lock);
@@ -2381,6 +2684,19 @@ void rcu_bind_current_to_nocb(void)
}
EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+// The ->on_cpu field is available only in CONFIG_SMP=y, so...
+#ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
+}
+#else // #ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return "";
+}
+#endif // #else #ifdef CONFIG_SMP
+
/*
* Dump out nocb grace-period kthread state for the specified rcu_data
* structure.
@@ -2389,7 +2705,7 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
{
struct rcu_node *rnp = rdp->mynode;
- pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
+ pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
rdp->cpu,
"kK"[!!rdp->nocb_gp_kthread],
"lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
@@ -2403,12 +2719,17 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
".B"[!!rdp->nocb_gp_bypass],
".G"[!!rdp->nocb_gp_gp],
(long)rdp->nocb_gp_seq,
- rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
+ rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, /* guard the kthread we dereference */
+ show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
}
/* Dump out nocb kthread state for the specified rcu_data structure. */
static void show_rcu_nocb_state(struct rcu_data *rdp)
{
+ char bufw[20];
+ char bufr[20];
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
bool wastimer;
@@ -2417,8 +2738,11 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
if (rdp->nocb_gp_rdp == rdp)
show_rcu_nocb_gp_state(rdp);
- pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
+ sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
"cC"[!!atomic_read(&rdp->nocb_lock_contended)],
@@ -2429,11 +2753,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
jiffies - rdp->nocb_nobypass_last,
rdp->nocb_nobypass_count,
".D"[rcu_segcblist_ready_cbs(rsclp)],
- ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
- ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
- ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
+ ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
+ ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
+ ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
- rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, /* CPU of the CB kthread, not the GP kthread */
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
/* It is OK for GP kthreads to have GP state. */
if (rdp->nocb_gp_rdp == rdp)
@@ -2518,8 +2847,9 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
return false;
}
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
+ return false;
}
static void rcu_spawn_cpu_nocb_kthread(int cpu)
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 70d48c52fabc9..475b26171b20f 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -266,6 +266,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
struct task_struct *t;
struct task_struct *ts[8];
+ lockdep_assert_irqs_disabled();
if (!rcu_preempt_blocked_readers_cgp(rnp))
return 0;
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
@@ -290,6 +291,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
".q"[rscr.rs.b.need_qs],
".e"[rscr.rs.b.exp_hint],
".l"[rscr.on_blkd_list]);
+ lockdep_assert_irqs_disabled();
put_task_struct(t);
ndetected++;
}
@@ -333,9 +335,12 @@ static void rcu_dump_cpu_stacks(void)
rcu_for_each_leaf_node(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
- if (!trigger_single_cpu_backtrace(cpu))
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ if (cpu_is_offline(cpu))
+ pr_err("Offline CPU %d blocking current GP.\n", cpu);
+ else if (!trigger_single_cpu_backtrace(cpu))
dump_cpu_task(cpu);
+ }
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -449,25 +454,66 @@ static void print_cpu_stall_info(int cpu)
/* Complain about starvation of grace-period kthread. */
static void rcu_check_gp_kthread_starvation(void)
{
+ int cpu;
struct task_struct *gpk = rcu_state.gp_kthread;
unsigned long j;
if (rcu_is_gp_kthread_starving(&j)) {
+ cpu = gpk ? task_cpu(gpk) : -1;
pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
data_race(rcu_state.gp_flags),
gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
+ gpk ? gpk->state : ~0, cpu);
if (gpk) {
pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
pr_err("RCU grace-period kthread stack dump:\n");
sched_show_task(gpk);
+ if (cpu >= 0) {
+ if (cpu_is_offline(cpu)) {
+ pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
+ } else {
+ pr_err("Stack dump where RCU GP kthread last ran:\n");
+ if (!trigger_single_cpu_backtrace(cpu))
+ dump_cpu_task(cpu);
+ }
+ }
wake_up_process(gpk);
}
}
}
+/*
+ * Complain when the grace-period kthread is still in RCU_GP_WAIT_FQS more
+ * than RCU_STALL_MIGHT_MIN jiffies past its recorded force-quiescent-state
+ * time and is not on a runqueue, i.e. its timer wakeup appears to be missing.
+ */
+static void rcu_check_gp_kthread_expired_fqs_timer(void)
+{
+	struct task_struct *gpk = rcu_state.gp_kthread;
+	unsigned long fqs_jiffies;
+	short state;
+	int gpk_cpu;
+
+	/*
+	 * Order reads of .gp_state and .jiffies_force_qs.
+	 * Matching smp_wmb() is present in rcu_gp_fqs_loop().
+	 */
+	state = smp_load_acquire(&rcu_state.gp_state);
+	fqs_jiffies = READ_ONCE(rcu_state.jiffies_force_qs);
+
+	/* Only relevant while the kthread is in the FQS wait state... */
+	if (state != RCU_GP_WAIT_FQS)
+		return;
+	/* ...the wait has overrun by more than RCU_STALL_MIGHT_MIN... */
+	if (!time_after(jiffies, fqs_jiffies + RCU_STALL_MIGHT_MIN))
+		return;
+	/* ...and the kthread exists but is not queued to run. */
+	if (!gpk || READ_ONCE(gpk->on_rq))
+		return;
+
+	gpk_cpu = task_cpu(gpk);
+	pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx\n",
+	       rcu_state.name, (jiffies - fqs_jiffies),
+	       (long)rcu_seq_current(&rcu_state.gp_seq),
+	       data_race(rcu_state.gp_flags),
+	       gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
+	       gpk->state);
+	pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
+	       gpk_cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, gpk_cpu));
+}
+
static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
{
int cpu;
@@ -478,6 +524,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
struct rcu_node *rnp;
long totqlen = 0;
+ lockdep_assert_irqs_disabled();
+
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
if (rcu_stall_is_suppressed())
@@ -499,6 +547,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
}
}
ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock.
+ lockdep_assert_irqs_disabled();
}
for_each_possible_cpu(cpu)
@@ -529,6 +578,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
WRITE_ONCE(rcu_state.jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+ rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
panic_on_rcu_stall();
@@ -544,6 +594,8 @@ static void print_cpu_stall(unsigned long gps)
struct rcu_node *rnp = rcu_get_root();
long totqlen = 0;
+ lockdep_assert_irqs_disabled();
+
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
if (rcu_stall_is_suppressed())
@@ -564,6 +616,7 @@ static void print_cpu_stall(unsigned long gps)
jiffies - gps,
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
rcu_dump_cpu_stacks();
@@ -598,6 +651,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
unsigned long js;
struct rcu_node *rnp;
+ lockdep_assert_irqs_disabled();
if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) ||
!rcu_gp_in_progress())
return;
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 39334d2d2b379..b95ae86c40a7d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -56,8 +56,10 @@
#ifndef CONFIG_TINY_RCU
module_param(rcu_expedited, int, 0);
module_param(rcu_normal, int, 0);
-static int rcu_normal_after_boot;
+static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT);
+#ifndef CONFIG_PREEMPT_RT
module_param(rcu_normal_after_boot, int, 0);
+#endif
#endif /* #ifndef CONFIG_TINY_RCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index d55a9f8cda3d4..2377cbb324742 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -398,6 +398,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
static int scftorture_invoker(void *arg)
{
int cpu;
+ int curcpu;
DEFINE_TORTURE_RANDOM(rand);
struct scf_statistics *scfp = (struct scf_statistics *)arg;
bool was_offline = false;
@@ -412,7 +413,10 @@ static int scftorture_invoker(void *arg)
VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id());
// Make sure that the CPU is affinitized appropriately during testing.
- WARN_ON_ONCE(smp_processor_id() != scfp->cpu);
+ curcpu = smp_processor_id();
+ WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids,
+ "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n",
+ __func__, scfp->cpu, curcpu, nr_cpu_ids);
if (!atomic_dec_return(&n_started))
while (atomic_read_acquire(&n_started)) {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ff74fca39ed21..7f5ffc8784110 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -355,8 +355,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = rq->hrtick_time;
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
@@ -380,7 +381,6 @@ static void __hrtick_start(void *arg)
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time;
s64 delta;
/*
@@ -388,9 +388,7 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- time = ktime_add_ns(timer->base->get_time(), delta);
-
- hrtimer_set_expires(timer, time);
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
if (rq == this_rq())
__hrtick_restart(rq);
@@ -3478,7 +3476,7 @@ out:
/**
* try_invoke_on_locked_down_task - Invoke a function on task in fixed state
- * @p: Process for which the function is to be invoked.
+ * @p: Process for which the function is to be invoked, can be @current.
* @func: Function to invoke.
* @arg: Argument to function.
*
@@ -3496,12 +3494,11 @@ out:
*/
bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
{
- bool ret = false;
struct rq_flags rf;
+ bool ret = false;
struct rq *rq;
- lockdep_assert_irqs_enabled();
- raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
if (p->on_rq) {
rq = __task_rq_lock(p, &rf);
if (task_rq(p) == rq)
@@ -3518,7 +3515,7 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t
ret = func(p, arg);
}
}
- raw_spin_unlock_irq(&p->pi_lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
}
@@ -4971,7 +4968,7 @@ static void __sched notrace __schedule(bool preempt)
schedule_debug(prev, preempt);
- if (sched_feat(HRTICK))
+ if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
@@ -5265,6 +5262,12 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+#endif
+
+
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
*
@@ -5317,8 +5320,197 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#endif
+
#endif /* CONFIG_PREEMPTION */
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+#include <linux/entry-common.h>
+
+/*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ */
+
+/* Preemption models that can be selected at boot or at run time. */
+enum {
+	preempt_dynamic_none = 0,
+	preempt_dynamic_voluntary,
+	preempt_dynamic_full,
+};
+
+/* Model currently in effect; updated by sched_dynamic_update(). */
+static int preempt_dynamic_mode = preempt_dynamic_full;
+
+/*
+ * Translate a preemption model name into its preempt_dynamic_* constant.
+ *
+ * @str: NUL-terminated name; must be exactly "none", "voluntary" or "full".
+ *
+ * Return: the matching preempt_dynamic_* value, or -EINVAL when @str is not
+ * one of the supported names.  Callers only need to test for a negative
+ * result.
+ */
+static int sched_dynamic_mode(const char *str)
+{
+	if (!strcmp(str, "none"))
+		return preempt_dynamic_none;
+
+	if (!strcmp(str, "voluntary"))
+		return preempt_dynamic_voluntary;
+
+	if (!strcmp(str, "full"))
+		return preempt_dynamic_full;
+
+	/* A bare -1 would read as -EPERM if it ever reached user space. */
+	return -EINVAL;
+}
+
+/*
+ * Retarget the preemption-related static calls (cond_resched,
+ * might_resched, preempt_schedule, preempt_schedule_notrace and
+ * irqentry_exit_cond_resched) for @mode and record @mode in
+ * preempt_dynamic_mode.
+ *
+ * @mode: preempt_dynamic_none, preempt_dynamic_voluntary or
+ *        preempt_dynamic_full.  The switch has no default case, so any
+ *        other value leaves every call at the targets installed by the
+ *        first group of updates and is still stored in
+ *        preempt_dynamic_mode.
+ */
+static void sched_dynamic_update(int mode)
+{
+ /*
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
+ */
+ /* First point every call at its real function ... */
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, __cond_resched);
+ static_call_update(preempt_schedule, __preempt_schedule_func);
+ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+
+ /* ... then install the targets specific to the requested mode. */
+ switch (mode) {
+ case preempt_dynamic_none:
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+ static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+ static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+ pr_info("Dynamic Preempt: none\n");
+ break;
+
+ case preempt_dynamic_voluntary:
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, __cond_resched);
+ static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+ static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+ static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
+
+ case preempt_dynamic_full:
+ static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(preempt_schedule, __preempt_schedule_func);
+ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: full\n");
+ break;
+ }
+
+ preempt_dynamic_mode = mode;
+}
+
+/*
+ * Handle the "preempt=" boot parameter.
+ *
+ * @str: value given after "preempt=" on the kernel command line.
+ *
+ * Return: 1 when @str named a supported mode and it has been applied, 0 when
+ * the value was not recognised.  __setup() handlers return nonzero to mark
+ * the option as consumed and 0 to report it as unhandled.
+ */
+static int __init setup_preempt_mode(char *str)
+{
+	int mode = sched_dynamic_mode(str);
+
+	if (mode < 0) {
+		pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+		return 0;
+	}
+
+	sched_dynamic_update(mode);
+	return 1;
+}
+__setup("preempt=", setup_preempt_mode);
+
+#ifdef CONFIG_SCHED_DEBUG
+
+/*
+ * Write handler for the sched_preempt debugfs file: parse a preemption mode
+ * name ("none", "voluntary" or "full") supplied by user space and switch to
+ * it.
+ *
+ * At most sizeof(buf) - 1 bytes of the write are consumed; leading and
+ * trailing whitespace is stripped before the name is parsed.
+ *
+ * Return: the number of bytes consumed, -EFAULT if the user buffer cannot
+ * be copied, or -EINVAL if the mode name is not recognised.
+ */
+static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+				   size_t cnt, loff_t *ppos)
+{
+	char buf[16];
+	int mode;
+
+	/* Leave room for the terminating NUL. */
+	if (cnt > sizeof(buf) - 1)
+		cnt = sizeof(buf) - 1;
+
+	if (copy_from_user(buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+	mode = sched_dynamic_mode(strstrip(buf));
+	/*
+	 * Report a uniform -EINVAL rather than forwarding whatever negative
+	 * value the parser happened to use.
+	 */
+	if (mode < 0)
+		return -EINVAL;
+
+	sched_dynamic_update(mode);
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+/*
+ * seq_file show callback: print every preemption mode name on one line,
+ * each followed by a space, with the mode whose index equals
+ * preempt_dynamic_mode wrapped in parentheses.
+ */
+static int sched_dynamic_show(struct seq_file *m, void *v)
+{
+	static const char * preempt_modes[] = {
+		"none", "voluntary", "full"
+	};
+	int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(preempt_modes); idx++)
+		seq_printf(m, "%s%s%s ",
+			   preempt_dynamic_mode == idx ? "(" : "",
+			   preempt_modes[idx],
+			   preempt_dynamic_mode == idx ? ")" : "");
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+/* Open callback: bind the file to sched_dynamic_show() via single_open(). */
+static int sched_dynamic_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_dynamic_show, NULL);
+}
+
+/*
+ * File operations for the sched_preempt debugfs file: reads go through the
+ * seq_file helpers, writes through sched_dynamic_write().
+ */
+static const struct file_operations sched_dynamic_fops = {
+ .open = sched_dynamic_open,
+ .write = sched_dynamic_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Create the "sched_preempt" debugfs file (mode 0644, NULL parent and
+ * data) backed by sched_dynamic_fops.  Registered as a late initcall and
+ * always returns 0.
+ */
+static __init int sched_init_debug_dynamic(void)
+{
+ debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
+ return 0;
+}
+late_initcall(sched_init_debug_dynamic);
+
+#endif /* CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+
/*
* This is the entry point to schedule() from kernel preemption
* off of irq context.
@@ -5616,8 +5808,12 @@ SYSCALL_DEFINE1(nice, int, increment)
* @p: the task in question.
*
* Return: The priority value as seen by users in /proc.
- * RT tasks are offset by -200. Normal tasks are centered
- * around 0, value goes from -16 to +15.
+ *
+ * sched policy         return value   kernel prio    user prio/nice
+ *
+ * normal, batch, idle     [0 ... 39]  [100 ... 139]  0/[-20 ... 19]
+ * fifo, rr             [-2 ... -100]     [98 ... 0]  [1 ... 99]
+ * deadline                     -101              -1  0
*/
int task_prio(const struct task_struct *p)
{
@@ -5676,6 +5872,120 @@ struct task_struct *idle_task(int cpu)
return cpu_rq(cpu)->idle;
}
+#ifdef CONFIG_SMP
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the irq utilization.
+ *
+ * The DL bandwidth number otoh is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ *
+ * @cpu: CPU whose runqueue (cpu_rq(@cpu)) supplies the rt, dl and irq numbers.
+ * @util_cfs: CFS utilization supplied by the caller; it is summed with
+ *            cpu_util_rt().
+ * @max: value returned when the CPU is considered saturated; the result is
+ *       never larger than this.
+ * @type: FREQUENCY_UTIL applies the uclamp constraints and adds cpu_bw_dl();
+ *        ENERGY_UTIL adds cpu_util_dl() instead.
+ * @p: task handed to uclamp_rq_util_with() when @type is FREQUENCY_UTIL; not
+ *     otherwise used here.
+ *
+ * Return: the effective utilization, capped at @max.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum cpu_util_type type,
+ struct task_struct *p)
+{
+ unsigned long dl_util, util, irq;
+ struct rq *rq = cpu_rq(cpu);
+
+ /* Without uclamp, a runnable RT task asks for @max in frequency mode. */
+ if (!uclamp_is_used() &&
+ type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
+ return max;
+ }
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= max))
+ return max;
+
+ /*
+ * Because the time spent on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ *
+ * CFS and RT utilization can be boosted or capped, depending on
+ * utilization clamp constraints requested by currently RUNNABLE
+ * tasks.
+ * When there are no CFS RUNNABLE tasks, clamps are released and
+ * frequency will be gracefully reduced with the utilization decay.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ if (type == FREQUENCY_UTIL)
+ util = uclamp_rq_util_with(rq, util, p);
+
+ dl_util = cpu_util_dl(rq);
+
+ /*
+ * For frequency selection we do not make cpu_util_dl() a permanent part
+ * of this sum because we want to use cpu_bw_dl() later on, but we need
+ * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+ * that we select f_max when there is no idle time.
+ *
+ * NOTE: numerical errors or stop class might cause us to not quite hit
+ * saturation when we should -- something for later.
+ */
+ if (util + dl_util >= max)
+ return max;
+
+ /*
+ * OTOH, for energy computation we need the estimated running time, so
+ * include util_dl and ignore dl_bw.
+ */
+ if (type == ENERGY_UTIL)
+ util += dl_util;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * irq metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ *              max - irq
+ *   U' = irq + --------- * U
+ *                 max
+ */
+ util = scale_irq_capacity(util, irq, max);
+ util += irq;
+
+ /*
+ * Bandwidth required by DEADLINE must always be granted while, for
+ * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+ * to gracefully reduce the frequency when no tasks show up for longer
+ * periods of time.
+ *
+ * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+ * bw_dl as requested freq. However, cpufreq is not yet ready for such
+ * an interface. So, we only do the latter for now.
+ */
+ if (type == FREQUENCY_UTIL)
+ util += cpu_bw_dl(rq);
+
+ /* Adding irq and the DL bandwidth may overshoot; clamp to @max. */
+ return min(max, util);
+}
+
+/*
+ * Effective utilization of @cpu in ENERGY_UTIL mode, computed from the
+ * runqueue's own CFS utilization with no task supplied, and capped at @max.
+ */
+unsigned long sched_cpu_util(int cpu, unsigned long max)
+{
+	unsigned long cfs_util = cpu_util_cfs(cpu_rq(cpu));
+
+	return effective_cpu_util(cpu, cfs_util, max, ENERGY_UTIL, NULL);
+}
+#endif /* CONFIG_SMP */
+
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
@@ -5797,11 +6107,10 @@ recheck:
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
* SCHED_BATCH and SCHED_IDLE is 0.
*/
- if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
+ if (attr->sched_priority > MAX_RT_PRIO-1)
return -EINVAL;
if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
(rt_policy(policy) != (attr->sched_priority != 0)))
@@ -6668,17 +6977,27 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-#ifndef CONFIG_PREEMPTION
-int __sched _cond_resched(void)
+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+int __sched __cond_resched(void)
{
if (should_resched(0)) {
preempt_schedule_common();
return 1;
}
+#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
+#endif
return 0;
}
-EXPORT_SYMBOL(_cond_resched);
+EXPORT_SYMBOL(__cond_resched);
+#endif
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(cond_resched);
+
+DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(might_resched);
#endif
/*
@@ -6869,7 +7188,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
switch (policy) {
case SCHED_FIFO:
case SCHED_RR:
- ret = MAX_USER_RT_PRIO-1;
+ ret = MAX_RT_PRIO-1;
break;
case SCHED_DEADLINE:
case SCHED_NORMAL:
@@ -7509,6 +7828,12 @@ int sched_cpu_deactivate(unsigned int cpu)
struct rq_flags rf;
int ret;
+ /*
+ * Remove CPU from nohz.idle_cpus_mask to prevent participating in
+ * load balancing when not active
+ */
+ nohz_balance_exit_idle(rq);
+
set_cpu_active(cpu, false);
/*
@@ -7653,7 +7978,6 @@ int sched_cpu_dying(unsigned int cpu)
calc_load_migrate(rq);
update_max_interval();
- nohz_balance_exit_idle(rq);
hrtick_clear(rq);
return 0;
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 6931f0cdeb802..41e498b0008a6 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -171,112 +171,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- * cpu_util_{cfs,rt,dl,irq}()
- * cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the irq utilization.
- *
- * The DL bandwidth number otoh is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
- struct task_struct *p)
-{
- unsigned long dl_util, util, irq;
- struct rq *rq = cpu_rq(cpu);
-
- if (!uclamp_is_used() &&
- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
- return max;
- }
-
- /*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
- */
- irq = cpu_util_irq(rq);
- if (unlikely(irq >= max))
- return max;
-
- /*
- * Because the time spend on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- *
- * CFS and RT utilization can be boosted or capped, depending on
- * utilization clamp constraints requested by currently RUNNABLE
- * tasks.
- * When there are no CFS RUNNABLE tasks, clamps are released and
- * frequency will be gracefully reduced with the utilization decay.
- */
- util = util_cfs + cpu_util_rt(rq);
- if (type == FREQUENCY_UTIL)
- util = uclamp_rq_util_with(rq, util, p);
-
- dl_util = cpu_util_dl(rq);
-
- /*
- * For frequency selection we do not make cpu_util_dl() a permanent part
- * of this sum because we want to use cpu_bw_dl() later on, but we need
- * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
- * that we select f_max when there is no idle time.
- *
- * NOTE: numerical errors or stop class might cause us to not quite hit
- * saturation when we should -- something for later.
- */
- if (util + dl_util >= max)
- return max;
-
- /*
- * OTOH, for energy computation we need the estimated running time, so
- * include util_dl and ignore dl_bw.
- */
- if (type == ENERGY_UTIL)
- util += dl_util;
-
- /*
- * There is still idle time; further improve the number by using the
- * irq metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- * max - irq
- * U' = irq + --------- * U
- * max
- */
- util = scale_irq_capacity(util, irq, max);
- util += irq;
-
- /*
- * Bandwidth required by DEADLINE must always be granted while, for
- * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
- * to gracefully reduce the frequency when no tasks show up for longer
- * periods of time.
- *
- * Ideally we would like to set bw_dl as min/guaranteed freq and util +
- * bw_dl as requested freq. However, cpufreq is not yet ready for such
- * an interface. So, we only do the latter for now.
- */
- if (type == FREQUENCY_UTIL)
- util += cpu_bw_dl(rq);
-
- return min(max, util);
-}
-
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
@@ -284,7 +178,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = schedutil_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
FREQUENCY_UTIL, NULL);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 75686c6d4436d..aac3539aa0fee 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -517,58 +517,44 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
update_dl_migration(dl_rq);
}
+/* Map a pushable_dl_tasks rb-tree node back to the task embedding it. */
+#define __node_2_pdl(node) \
+	rb_entry((node), struct task_struct, pushable_dl_tasks)
+
+/*
+ * Ordering callback for the pushable-deadline tree: true when the task
+ * behind @a should sort before the task behind @b, as decided by
+ * dl_entity_preempt().
+ */
+static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
+{
+	struct task_struct *task_a = __node_2_pdl(a);
+	struct task_struct *task_b = __node_2_pdl(b);
+
+	return dl_entity_preempt(&task_a->dl, &task_b->dl);
+}
+
/*
* The list of pushable -deadline task is not a plist, like in
* sched_rt.c, it is an rb-tree with tasks ordered by deadline.
*/
static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
- struct dl_rq *dl_rq = &rq->dl;
- struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct task_struct *entry;
- bool leftmost = true;
+ struct rb_node *leftmost;
BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct task_struct,
- pushable_dl_tasks);
- if (dl_entity_preempt(&p->dl, &entry->dl))
- link = &parent->rb_left;
- else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
-
+ leftmost = rb_add_cached(&p->pushable_dl_tasks,
+ &rq->dl.pushable_dl_tasks_root,
+ __pushable_less);
if (leftmost)
- dl_rq->earliest_dl.next = p->dl.deadline;
-
- rb_link_node(&p->pushable_dl_tasks, parent, link);
- rb_insert_color_cached(&p->pushable_dl_tasks,
- &dl_rq->pushable_dl_tasks_root, leftmost);
+ rq->dl.earliest_dl.next = p->dl.deadline;
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct dl_rq *dl_rq = &rq->dl;
+ struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root;
+ struct rb_node *leftmost;
if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
return;
- if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) {
- struct rb_node *next_node;
-
- next_node = rb_next(&p->pushable_dl_tasks);
- if (next_node) {
- dl_rq->earliest_dl.next = rb_entry(next_node,
- struct task_struct, pushable_dl_tasks)->dl.deadline;
- }
- }
+ leftmost = rb_erase_cached(&p->pushable_dl_tasks, root);
+ if (leftmost)
+ dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
- rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
}
@@ -1478,29 +1464,21 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
dec_dl_migration(dl_se, dl_rq);
}
+/* Map a deadline rb-tree node back to the sched_dl_entity embedding it. */
+#define __node_2_dle(node) \
+	rb_entry((node), struct sched_dl_entity, rb_node)
+
+/*
+ * Ordering callback for the deadline tree: true when the entity behind @a
+ * has a deadline that dl_time_before() places ahead of the one behind @b.
+ */
+static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
+{
+	struct sched_dl_entity *dl_a = __node_2_dle(a);
+	struct sched_dl_entity *dl_b = __node_2_dle(b);
+
+	return dl_time_before(dl_a->deadline, dl_b->deadline);
+}
+
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rb_node **link = &dl_rq->root.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct sched_dl_entity *entry;
- int leftmost = 1;
BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_dl_entity, rb_node);
- if (dl_time_before(dl_se->deadline, entry->deadline))
- link = &parent->rb_left;
- else {
- link = &parent->rb_right;
- leftmost = 0;
- }
- }
-
- rb_link_node(&dl_se->rb_node, parent, link);
- rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost);
+ rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
inc_dl_tasks(dl_se, dl_rq);
}
@@ -1513,6 +1491,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
return;
rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
+
RB_CLEAR_NODE(&dl_se->rb_node);
dec_dl_tasks(dl_se, dl_rq);
@@ -1853,7 +1832,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
if (!first)
return;
- if (hrtick_enabled(rq))
+ if (hrtick_enabled_dl(rq))
start_hrtick_dl(rq, p);
if (rq->curr->sched_class != &dl_sched_class)
@@ -1916,7 +1895,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
* not being the leftmost task anymore. In that case NEED_RESCHED will
* be set and schedule() will start a new hrtick for the next task.
*/
- if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+ if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
is_leftmost(p, &rq->dl))
start_hrtick_dl(rq, p);
}
@@ -2409,9 +2388,13 @@ void dl_add_task_root_domain(struct task_struct *p)
struct rq *rq;
struct dl_bw *dl_b;
- rq = task_rq_lock(p, &rf);
- if (!dl_task(p))
- goto unlock;
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ if (!dl_task(p)) {
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+ return;
+ }
+
+ rq = __task_rq_lock(p, &rf);
dl_b = &rq->rd->dl_bw;
raw_spin_lock(&dl_b->lock);
@@ -2420,7 +2403,6 @@ void dl_add_task_root_domain(struct task_struct *p)
raw_spin_unlock(&dl_b->lock);
-unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -2514,7 +2496,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (task_on_rq_queued(p) || rq->curr == p) {
+ if (task_on_rq_queued(p) || task_current(rq, p)) {
#ifdef CONFIG_SMP
/*
* This might be too much, but unfortunately
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2357921580f9c..486f403a778b2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -486,7 +486,7 @@ static char *task_group_path(struct task_group *tg)
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
- if (rq->curr == p)
+ if (task_current(rq, p))
SEQ_printf(m, ">R");
else
SEQ_printf(m, " %c", task_state_to_char(p));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04a3ce20da671..8a8bd7b13634d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -531,12 +531,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
-static inline int entity_before(struct sched_entity *a,
+static inline bool entity_before(struct sched_entity *a,
struct sched_entity *b)
{
return (s64)(a->vruntime - b->vruntime) < 0;
}
+#define __node_2_se(node) \
+ rb_entry((node), struct sched_entity, run_node)
+
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
@@ -552,8 +555,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
}
if (leftmost) { /* non-empty tree */
- struct sched_entity *se;
- se = rb_entry(leftmost, struct sched_entity, run_node);
+ struct sched_entity *se = __node_2_se(leftmost);
if (!curr)
vruntime = se->vruntime;
@@ -569,37 +571,17 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
#endif
}
+/*
+ * Ordering callback for the CFS timeline tree: true when the entity behind
+ * @a sorts before the entity behind @b according to entity_before().
+ */
+static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
+{
+	struct sched_entity *se_a = __node_2_se(a);
+	struct sched_entity *se_b = __node_2_se(b);
+
+	return entity_before(se_a, se_b);
+}
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct sched_entity *entry;
- bool leftmost = true;
-
- /*
- * Find the right place in the rbtree:
- */
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_entity, run_node);
- /*
- * We dont care about collisions. Nodes with
- * the same key stay together.
- */
- if (entity_before(se, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
-
- rb_link_node(&se->run_node, parent, link);
- rb_insert_color_cached(&se->run_node,
- &cfs_rq->tasks_timeline, leftmost);
+ rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -614,7 +596,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
if (!left)
return NULL;
- return rb_entry(left, struct sched_entity, run_node);
+ return __node_2_se(left);
}
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
@@ -624,7 +606,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
if (!next)
return NULL;
- return rb_entry(next, struct sched_entity, run_node);
+ return __node_2_se(next);
}
#ifdef CONFIG_SCHED_DEBUG
@@ -635,7 +617,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
if (!last)
return NULL;
- return rb_entry(last, struct sched_entity, run_node);
+ return __node_2_se(last);
}
/**************************************************************
@@ -3943,6 +3925,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
trace_sched_util_est_cfs_tp(cfs_rq);
}
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Update root cfs_rq's estimated utilization */
+ enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
/*
* Check if a (signed) value is within a specified (unsigned) margin,
* based on the observation that:
@@ -3956,23 +3954,16 @@ static inline bool within_margin(int value, int margin)
return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}
-static void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+static inline void util_est_update(struct cfs_rq *cfs_rq,
+ struct task_struct *p,
+ bool task_sleep)
{
long last_ewma_diff;
struct util_est ue;
- int cpu;
if (!sched_feat(UTIL_EST))
return;
- /* Update root cfs_rq's estimated utilization */
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
-
- trace_sched_util_est_cfs_tp(cfs_rq);
-
/*
* Skip update of task's estimated utilization when the task has not
* yet completed an activation, e.g. being migrated.
@@ -4012,8 +4003,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
* To avoid overestimation of actual task utilization, skip updates if
* we cannot grant there is idle time in this CPU.
*/
- cpu = cpu_of(rq_of(cfs_rq));
- if (task_util(p) > capacity_orig_of(cpu))
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
return;
/*
@@ -4052,7 +4042,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
if (!static_branch_unlikely(&sched_asym_cpucapacity))
return;
- if (!p) {
+ if (!p || p->nr_cpus_allowed == 1) {
rq->misfit_task_load = 0;
return;
}
@@ -4096,8 +4086,11 @@ static inline void
util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
- bool task_sleep) {}
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
+ bool task_sleep) {}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
@@ -5419,7 +5412,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
s64 delta = slice - ran;
if (delta < 0) {
- if (rq->curr == p)
+ if (task_current(rq, p))
resched_curr(rq);
return;
}
@@ -5436,7 +5429,7 @@ static void hrtick_update(struct rq *rq)
{
struct task_struct *curr = rq->curr;
- if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
+ if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
return;
if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -5609,6 +5602,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
+ util_est_dequeue(&rq->cfs, p);
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
@@ -5659,7 +5654,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
rq->next_balance = jiffies;
dequeue_throttle:
- util_est_dequeue(&rq->cfs, p, task_sleep);
+ util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -6006,6 +6001,14 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return new_cpu;
}
+static inline int __select_idle_cpu(int cpu)
+{
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ return cpu;
+
+ return -1;
+}
+
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
@@ -6064,74 +6067,51 @@ unlock:
* there are no idle cores left in the system; tracked through
* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
*/
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu;
+ bool idle = true;
+ int cpu;
if (!static_branch_likely(&sched_smt_present))
- return -1;
+ return __select_idle_cpu(core);
- if (!test_idle_cores(target, false))
- return -1;
-
- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
-
- for_each_cpu_wrap(core, cpus, target) {
- bool idle = true;
-
- for_each_cpu(cpu, cpu_smt_mask(core)) {
- if (!available_idle_cpu(cpu)) {
- idle = false;
- break;
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (!available_idle_cpu(cpu)) {
+ idle = false;
+ if (*idle_cpu == -1) {
+ if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
+ *idle_cpu = cpu;
+ break;
+ }
+ continue;
}
+ break;
}
-
- if (idle)
- return core;
-
- cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
+ if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
+ *idle_cpu = cpu;
}
- /*
- * Failed to find an idle core; stop looking for one.
- */
- set_idle_cores(target, 0);
+ if (idle)
+ return core;
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
return -1;
}
-/*
- * Scan the local SMT mask for idle CPUs.
- */
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
- int cpu;
-
- if (!static_branch_likely(&sched_smt_present))
- return -1;
-
- for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
- !cpumask_test_cpu(cpu, sched_domain_span(sd)))
- continue;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
- return cpu;
- }
+#else /* CONFIG_SCHED_SMT */
- return -1;
+static inline void set_idle_cores(int cpu, int val)
+{
}
-#else /* CONFIG_SCHED_SMT */
-
-static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static inline bool test_idle_cores(int cpu, bool def)
{
- return -1;
+ return def;
}
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
- return -1;
+ return __select_idle_cpu(core);
}
#endif /* CONFIG_SCHED_SMT */
@@ -6144,49 +6124,61 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ bool smt = test_idle_cores(target, false);
+ int this = smp_processor_id();
struct sched_domain *this_sd;
- u64 avg_cost, avg_idle;
u64 time;
- int this = smp_processor_id();
- int cpu, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
return -1;
- /*
- * Due to large variance we need a large fuzz factor; hackbench in
- * particularly is sensitive here.
- */
- avg_idle = this_rq()->avg_idle / 512;
- avg_cost = this_sd->avg_scan_cost + 1;
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
- return -1;
+ if (sched_feat(SIS_PROP) && !smt) {
+ u64 avg_cost, avg_idle, span_avg;
+
+ /*
+ * Due to large variance we need a large fuzz factor;
+ * hackbench in particular is sensitive here.
+ */
+ avg_idle = this_rq()->avg_idle / 512;
+ avg_cost = this_sd->avg_scan_cost + 1;
- if (sched_feat(SIS_PROP)) {
- u64 span_avg = sd->span_weight * avg_idle;
+ span_avg = sd->span_weight * avg_idle;
if (span_avg > 4*avg_cost)
nr = div_u64(span_avg, avg_cost);
else
nr = 4;
- }
-
- time = cpu_clock(this);
- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+ time = cpu_clock(this);
+ }
for_each_cpu_wrap(cpu, cpus, target) {
- if (!--nr)
- return -1;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
- break;
+ if (smt) {
+ i = select_idle_core(p, cpu, cpus, &idle_cpu);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+
+ } else {
+ if (!--nr)
+ return -1;
+ idle_cpu = __select_idle_cpu(cpu);
+ if ((unsigned int)idle_cpu < nr_cpumask_bits)
+ break;
+ }
}
- time = cpu_clock(this) - time;
- update_avg(&this_sd->avg_scan_cost, time);
+ if (smt)
+ set_idle_cores(this, false);
- return cpu;
+ if (sched_feat(SIS_PROP) && !smt) {
+ time = cpu_clock(this) - time;
+ update_avg(&this_sd->avg_scan_cost, time);
+ }
+
+ return idle_cpu;
}
/*
@@ -6315,18 +6307,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (!sd)
return target;
- i = select_idle_core(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
-
i = select_idle_cpu(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
-
return target;
}
@@ -6543,7 +6527,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* is already enough to scale the EM reported power
* consumption at the (eventually clamped) cpu_capacity.
*/
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap,
ENERGY_UTIL, NULL);
/*
@@ -6553,7 +6537,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap,
FREQUENCY_UTIL, tsk);
max_util = max(max_util, cpu_util);
}
@@ -6651,7 +6635,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* IOW, placing the task there would make the CPU
* overutilized. Take uclamp into account to see how
* much capacity we can get out of the CPU; this is
- * aligned with schedutil_cpu_util().
+ * aligned with sched_cpu_util().
*/
util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
if (!fits_capacity(util, cpu_cap))
@@ -7132,7 +7116,7 @@ done: __maybe_unused;
list_move(&p->se.group_node, &rq->cfs_tasks);
#endif
- if (hrtick_enabled(rq))
+ if (hrtick_enabled_fair(rq))
hrtick_start_fair(rq, p);
update_misfit_status(p, rq);
@@ -9389,8 +9373,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- capacity = capacity_of(i);
nr_running = rq->cfs.h_nr_running;
+ if (!nr_running)
+ continue;
+
+ capacity = capacity_of(i);
/*
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
@@ -9496,13 +9483,32 @@ asym_active_balance(struct lb_env *env)
}
static inline bool
-voluntary_active_balance(struct lb_env *env)
+imbalanced_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ /*
+ * The imbalanced case includes the case of pinned tasks preventing a fair
+ * distribution of the load on the system but also the even distribution of the
+ * threads on a system with spare capacity
+ */
+ if ((env->migration_type == migrate_task) &&
+ (sd->nr_balance_failed > sd->cache_nice_tries+2))
+ return 1;
+
+ return 0;
+}
+
+static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
if (asym_active_balance(env))
return 1;
+ if (imbalanced_active_balance(env))
+ return 1;
+
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
* It's worth migrating the task if the src_cpu's capacity is reduced
@@ -9522,16 +9528,6 @@ voluntary_active_balance(struct lb_env *env)
return 0;
}
-static int need_active_balance(struct lb_env *env)
-{
- struct sched_domain *sd = env->sd;
-
- if (voluntary_active_balance(env))
- return 1;
-
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
-}
-
static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
@@ -9623,6 +9619,8 @@ redo:
env.src_rq = busiest;
ld_moved = 0;
+ /* Clear this flag as soon as we find a pullable task */
+ env.flags |= LBF_ALL_PINNED;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
@@ -9630,7 +9628,6 @@ redo:
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- env.flags |= LBF_ALL_PINNED;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
@@ -9756,10 +9753,12 @@ more_balance:
if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
- env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
+ /* Record that we found at least one task that could run on this_cpu */
+ env.flags &= ~LBF_ALL_PINNED;
+
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
@@ -9781,21 +9780,13 @@ more_balance:
/* We've kicked active balancing, force task migration. */
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
- } else
+ } else {
sd->nr_balance_failed = 0;
+ }
- if (likely(!active_balance) || voluntary_active_balance(&env)) {
+ if (likely(!active_balance) || need_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
- } else {
- /*
- * If we've begun active balancing, start to back off. This
- * case may not be covered by the all_pinned logic if there
- * is only 1 task on the busy runqueue (because we don't call
- * detach_tasks).
- */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval *= 2;
}
goto out;
@@ -10700,8 +10691,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
*/
void trigger_load_balance(struct rq *rq)
{
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
+ /*
+ * Don't need to rebalance while attached to NULL domain or
+ * runqueue CPU is not active
+ */
+ if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
return;
if (time_after_eq(jiffies, rq->next_balance))
@@ -10806,7 +10800,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (rq->curr == p) {
+ if (task_current(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
} else
@@ -10939,7 +10933,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
- if (rq->curr == p)
+ if (task_current(rq, p))
resched_curr(rq);
else
check_preempt_curr(rq, p, 0);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 68d369cba9e45..1bc2b158fc515 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -38,6 +38,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
+SCHED_FEAT(HRTICK_DL, false)
SCHED_FEAT(DOUBLE_TICK, false)
/*
@@ -54,7 +55,6 @@ SCHED_FEAT(TTWU_QUEUE, true)
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
-SCHED_FEAT(SIS_AVG_CPU, false)
SCHED_FEAT(SIS_PROP, true)
/*
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 305727ea06772..7199e6f23789e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -285,6 +285,7 @@ static void do_idle(void)
}
arch_cpu_idle_enter();
+ rcu_nocb_flush_deferred_wakeup();
/*
* In poll mode we reenable interrupts and spin. Also if we
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index dbe4629cf7ba4..8f720b71d13dd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2357,7 +2357,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
- if (rq->curr == p) {
+ if (task_current(rq, p)) {
#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bb09988451a04..10a1522b1e303 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -140,7 +140,7 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
* scale_load() and scale_load_down(w) to convert between them. The
* following must be true:
*
- * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
+ * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD
*
*/
#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT)
@@ -1031,6 +1031,7 @@ struct rq {
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
+ ktime_t hrtick_time;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -2104,17 +2105,39 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
*/
static inline int hrtick_enabled(struct rq *rq)
{
- if (!sched_feat(HRTICK))
- return 0;
if (!cpu_active(cpu_of(rq)))
return 0;
return hrtimer_is_hres_active(&rq->hrtick_timer);
}
+static inline int hrtick_enabled_fair(struct rq *rq)
+{
+ if (!sched_feat(HRTICK))
+ return 0;
+ return hrtick_enabled(rq);
+}
+
+static inline int hrtick_enabled_dl(struct rq *rq)
+{
+ if (!sched_feat(HRTICK_DL))
+ return 0;
+ return hrtick_enabled(rq);
+}
+
void hrtick_start(struct rq *rq, u64 delay);
#else
+static inline int hrtick_enabled_fair(struct rq *rq)
+{
+ return 0;
+}
+
+static inline int hrtick_enabled_dl(struct rq *rq)
+{
+ return 0;
+}
+
static inline int hrtick_enabled(struct rq *rq)
{
return 0;
@@ -2558,27 +2581,24 @@ static inline unsigned long capacity_orig_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity_orig;
}
-#endif
/**
- * enum schedutil_type - CPU utilization type
+ * enum cpu_util_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
* @ENERGY_UTIL: Utilization used during energy calculation
*
* The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
* need to be aggregated differently depending on the usage made of them. This
- * enum is used within schedutil_freq_util() to differentiate the types of
+ * enum is used within effective_cpu_util() to differentiate the types of
* utilization expected by the callers, and adjust the aggregation accordingly.
*/
-enum schedutil_type {
+enum cpu_util_type {
FREQUENCY_UTIL,
ENERGY_UTIL,
};
-#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
-
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum cpu_util_type type,
struct task_struct *p);
static inline unsigned long cpu_bw_dl(struct rq *rq)
@@ -2607,14 +2627,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
{
return READ_ONCE(rq->avg_rt.util_avg);
}
-#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
-static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
- struct task_struct *p)
-{
- return 0;
-}
-#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+#endif
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 5d3675c7a76be..09d35044bd889 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1596,66 +1596,58 @@ static void init_numa_topology_type(void)
}
}
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
void sched_init_numa(void)
{
- int next_distance, curr_distance = node_distance(0, 0);
struct sched_domain_topology_level *tl;
- int level = 0;
- int i, j, k;
-
- sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
- if (!sched_domains_numa_distance)
- return;
-
- /* Includes NUMA identity node at level 0. */
- sched_domains_numa_distance[level++] = curr_distance;
- sched_domains_numa_levels = level;
+ unsigned long *distance_map;
+ int nr_levels = 0;
+ int i, j;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
- *
- * Assumes node_distance(0,j) includes all distances in
- * node_distance(i,j) in order to avoid cubic time.
*/
- next_distance = curr_distance;
+ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+ if (!distance_map)
+ return;
+
+ bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for (i = 0; i < nr_node_ids; i++) {
for (j = 0; j < nr_node_ids; j++) {
- for (k = 0; k < nr_node_ids; k++) {
- int distance = node_distance(i, k);
-
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
-
- /*
- * While not a strong assumption it would be nice to know
- * about cases where if node A is connected to B, B is not
- * equally connected to A.
- */
- if (sched_debug() && node_distance(k, i) != distance)
- sched_numa_warn("Node-distance not symmetric");
+ int distance = node_distance(i, j);
- if (sched_debug() && i && !find_numa_distance(distance))
- sched_numa_warn("Node-0 not representative");
+ if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+ sched_numa_warn("Invalid distance value range");
+ return;
}
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
+
+ bitmap_set(distance_map, distance, 1);
}
+ }
+ /*
+ * We can now figure out how many unique distance values there are and
+ * allocate memory accordingly.
+ */
+ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
- /*
- * In case of sched_debug() we verify the above assumption.
- */
- if (!sched_debug())
- break;
+ sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!sched_domains_numa_distance) {
+ bitmap_free(distance_map);
+ return;
+ }
+
+ for (i = 0, j = 0; i < nr_levels; i++, j++) {
+ j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+ sched_domains_numa_distance[i] = j;
}
+ bitmap_free(distance_map);
+
/*
- * 'level' contains the number of unique distances
+ * 'nr_levels' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
@@ -1664,15 +1656,15 @@ void sched_init_numa(void)
/*
* Here, we should temporarily reset sched_domains_numa_levels to 0.
* If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain less then 'level' members. This could be
+ * the array will contain fewer than 'nr_levels' members. This could be
* dangerous when we use it to iterate array sched_domains_numa_masks[][]
* in other functions.
*
- * We reset it to 'level' at the end of this function.
+ * We reset it to 'nr_levels' at the end of this function.
*/
sched_domains_numa_levels = 0;
- sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
if (!sched_domains_numa_masks)
return;
@@ -1680,7 +1672,7 @@ void sched_init_numa(void)
* Now for each level, construct a mask per node which contains all
* CPUs of nodes that are that many hops away from us.
*/
- for (i = 0; i < level; i++) {
+ for (i = 0; i < nr_levels; i++) {
sched_domains_numa_masks[i] =
kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
if (!sched_domains_numa_masks[i])
@@ -1688,12 +1680,17 @@ void sched_init_numa(void)
for (j = 0; j < nr_node_ids; j++) {
struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ int k;
+
if (!mask)
return;
sched_domains_numa_masks[i][j] = mask;
for_each_node(k) {
+ if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ sched_numa_warn("Node-distance not symmetric");
+
if (node_distance(j, k) > sched_domains_numa_distance[i])
continue;
@@ -1705,7 +1702,7 @@ void sched_init_numa(void)
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
- tl = kzalloc((i + level + 1) *
+ tl = kzalloc((i + nr_levels + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@@ -1728,7 +1725,7 @@ void sched_init_numa(void)
/*
* .. and append 'j' levels of NUMA goodness.
*/
- for (j = 1; j < level; i++, j++) {
+ for (j = 1; j < nr_levels; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
@@ -1740,8 +1737,8 @@ void sched_init_numa(void)
sched_domain_topology = tl;
- sched_domains_numa_levels = level;
- sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+ sched_domains_numa_levels = nr_levels;
+ sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
init_numa_topology_type();
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9d71046ea247d..9908ec4a9bfed 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -26,6 +26,8 @@
#include <linux/tick.h>
#include <linux/irq.h>
+#include <asm/softirq_stack.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 84565c2a41b8f..6906c6ec4c97d 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -12,6 +12,8 @@
extern struct static_call_site __start_static_call_sites[],
__stop_static_call_sites[];
+extern struct static_call_tramp_key __start_static_call_tramp_key[],
+ __stop_static_call_tramp_key[];
static bool static_call_initialized;
@@ -323,10 +325,59 @@ static int __static_call_mod_text_reserved(void *start, void *end)
return ret;
}
+static unsigned long tramp_key_lookup(unsigned long addr)
+{
+ struct static_call_tramp_key *start = __start_static_call_tramp_key;
+ struct static_call_tramp_key *stop = __stop_static_call_tramp_key;
+ struct static_call_tramp_key *tramp_key;
+
+ for (tramp_key = start; tramp_key != stop; tramp_key++) {
+ unsigned long tramp;
+
+ tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp;
+ if (tramp == addr)
+ return (long)tramp_key->key + (long)&tramp_key->key;
+ }
+
+ return 0;
+}
+
static int static_call_add_module(struct module *mod)
{
- return __static_call_init(mod, mod->static_call_sites,
- mod->static_call_sites + mod->num_static_call_sites);
+ struct static_call_site *start = mod->static_call_sites;
+ struct static_call_site *stop = start + mod->num_static_call_sites;
+ struct static_call_site *site;
+
+ for (site = start; site != stop; site++) {
+ unsigned long addr = (unsigned long)static_call_key(site);
+ unsigned long key;
+
+ /*
+ * If the key is exported, 'addr' points to the key, which
+ * means modules are allowed to call static_call_update() on
+ * it.
+ *
+ * Otherwise, the key isn't exported, and 'addr' points to the
+ * trampoline so we need to lookup the key.
+ *
+ * We go through this dance to prevent crazy modules from
+ * abusing sensitive static calls.
+ */
+ if (!kernel_text_address(addr))
+ continue;
+
+ key = tramp_key_lookup(addr);
+ if (!key) {
+ pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n",
+ static_call_addr(site));
+ return -EINVAL;
+ }
+
+ site->key = (key - (long)&site->key) |
+ (site->key & STATIC_CALL_SITE_FLAGS);
+ }
+
+ return __static_call_init(mod, start, stop);
}
static void static_call_del_module(struct module *mod)
@@ -438,6 +489,11 @@ int __init static_call_init(void)
}
early_initcall(static_call_init);
+long __static_call_return0(void)
+{
+ return 0;
+}
+
#ifdef CONFIG_STATIC_CALL_SELFTEST
static int func_a(int x)
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f4ace1bf83828..98d7a15e8cf69 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -527,8 +527,11 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
/**
* alarm_handle_timer - Callback for posix timers
* @alarm: alarm that fired
+ * @now: time at the timer expiration
*
* Posix timer callback for expired alarm timers.
+ *
+ * Return: whether the timer is to be restarted
*/
static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
ktime_t now)
@@ -715,8 +718,11 @@ static int alarm_timer_create(struct k_itimer *new_timer)
/**
* alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
* @alarm: ptr to alarm that fired
+ * @now: time at the timer expiration
*
* Wakes up the task that set the alarmtimer
+ *
+ * Return: ALARMTIMER_NORESTART
*/
static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
ktime_t now)
@@ -733,6 +739,7 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
* alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
* @alarm: ptr to alarmtimer
* @absexp: absolute expiration time
+ * @type: alarm type (BOOTTIME/REALTIME).
*
* Sets the alarm timer and sleeps until it is fired or interrupted.
*/
@@ -806,7 +813,6 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
* @which_clock: clockid
* @flags: determins abstime or relative
* @tsreq: requested sleep time (abs or rel)
- * @rmtp: remaining sleep time saved
*
* Handles clock_nanosleep calls against _ALARM clockids
*/
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 743c852e10f23..88a0145d86c07 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -626,24 +626,30 @@ static inline int hrtimer_hres_active(void)
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
- ktime_t expires_next;
+ ktime_t expires_next, soft = KTIME_MAX;
/*
- * Find the current next expiration time.
+ * If the soft interrupt has already been activated, ignore the
+ * soft bases. They will be handled in the already raised soft
+ * interrupt.
*/
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
-
- if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
+ if (!cpu_base->softirq_activated) {
+ soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
/*
- * When the softirq is activated, hrtimer has to be
- * programmed with the first hard hrtimer because soft
- * timer interrupt could occur too late.
+ * Update the soft expiry time. clock_settime() might have
+ * affected it.
*/
- if (cpu_base->softirq_activated)
- expires_next = __hrtimer_get_next_event(cpu_base,
- HRTIMER_ACTIVE_HARD);
- else
- cpu_base->softirq_expires_next = expires_next;
+ cpu_base->softirq_expires_next = soft;
+ }
+
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
+ /*
+ * If a softirq timer is expiring first, update cpu_base->next_timer
+ * and program the hardware with the soft expiry time.
+ */
+ if (expires_next > soft) {
+ cpu_base->next_timer = cpu_base->softirq_next_timer;
+ expires_next = soft;
}
if (skip_equal && expires_next == cpu_base->expires_next)
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 6ca625f5e5544..12eab0d2ae28d 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -465,9 +465,3 @@ struct time_namespace init_time_ns = {
.ns.ops = &timens_operations,
.frozen_offsets = true,
};
-
-static int __init time_ns_init(void)
-{
- return 0;
-}
-subsys_initcall(time_ns_init);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 8dbc008f8942b..f475f1a027c8b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1237,6 +1237,20 @@ int try_to_del_timer_sync(struct timer_list *timer)
}
EXPORT_SYMBOL(try_to_del_timer_sync);
+/*
+ * timer_curr_running - check whether @timer is running on this CPU
+ *
+ * Walks all NR_BASES timer bases of the local CPU and returns true as soon
+ * as one of them records @timer as its running_timer; returns false when
+ * none does.
+ */
+bool timer_curr_running(struct timer_list *timer)
+{
+	int idx = 0;
+
+	while (idx < NR_BASES) {
+		if (this_cpu_ptr(&timer_bases[idx])->running_timer == timer)
+			return true;
+		idx++;
+	}
+
+	return false;
+}
+
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
diff --git a/kernel/torture.c b/kernel/torture.c
index 8562ac18d2eb5..01e336f1e5b20 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -48,6 +48,12 @@ module_param(disable_onoff_at_boot, bool, 0444);
static bool ftrace_dump_at_shutdown;
module_param(ftrace_dump_at_shutdown, bool, 0444);
+static int verbose_sleep_frequency;
+module_param(verbose_sleep_frequency, int, 0444);
+
+static int verbose_sleep_duration = 1;
+module_param(verbose_sleep_duration, int, 0444);
+
static char *torture_type;
static int verbose;
@@ -58,6 +64,95 @@ static int verbose;
static int fullstop = FULLSTOP_RMMOD;
static DEFINE_MUTEX(fullstop_mutex);
+static atomic_t verbose_sleep_counter;
+
+/*
+ * Throttle helper for VERBOSE_TOROUT*(): on every
+ * verbose_sleep_frequency-th call, pass verbose_sleep_duration to
+ * schedule_timeout_uninterruptible().  If either module parameter is not
+ * positive, return at once without touching the call counter.
+ */
+void verbose_torout_sleep(void)
+{
+	if (verbose_sleep_frequency <= 0 || verbose_sleep_duration <= 0)
+		return;
+
+	/* Only calls whose running count is a multiple of the frequency sleep. */
+	if (atomic_inc_return(&verbose_sleep_counter) % verbose_sleep_frequency)
+		return;
+
+	schedule_timeout_uninterruptible(verbose_sleep_duration);
+}
+EXPORT_SYMBOL_GPL(verbose_torout_sleep);
+
+/*
+ * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit
+ * nanosecond random fuzz.  This function and its friends desynchronize
+ * testing from the timer wheel.
+ *
+ * @baset_ns: base sleep time, in nanoseconds.
+ * @fuzzt_ns: exclusive upper bound, in nanoseconds, on the random delay
+ *	      added to @baset_ns; zero means no fuzz.
+ * @trsp: random-number state used to draw the fuzz, or NULL for no fuzz.
+ *
+ * The task sleeps in TASK_UNINTERRUPTIBLE state for a relative timeout.
+ * Returns the value returned by schedule_hrtimeout().
+ */
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp)
+{
+	ktime_t hto = baset_ns;
+
+	/* A zero bound must skip the modulo, which would divide by zero. */
+	if (trsp && fuzzt_ns)
+		hto += (torture_random(trsp) >> 3) % fuzzt_ns;
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	return schedule_hrtimeout(&hto, HRTIMER_MODE_REL);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_ns);
+
+/*
+ * Schedule a high-resolution-timer sleep in microseconds, with a 32-bit
+ * nanosecond (not microsecond!) random fuzz.
+ *
+ * @baset_us: base sleep time, in microseconds.
+ * @fuzzt_ns: bound on the random fuzz, in nanoseconds.
+ * @trsp: random-number state, or NULL for no fuzz.
+ *
+ * Returns the value returned by torture_hrtimeout_ns().
+ */
+int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp)
+{
+	/*
+	 * Widen to ktime_t before multiplying so that the product is not
+	 * truncated to the width of "u32 * NSEC_PER_USEC" first.
+	 */
+	ktime_t baset_ns = (ktime_t)baset_us * NSEC_PER_USEC;
+
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_us);
+
+/*
+ * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit
+ * microsecond (not millisecond!) random fuzz.
+ *
+ * @baset_ms: base sleep time, in milliseconds.
+ * @fuzzt_us: bound on the random fuzz, in microseconds.
+ * @trsp: random-number state, or NULL for no fuzz.
+ *
+ * Returns the value returned by torture_hrtimeout_ns().
+ */
+int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp)
+{
+	/*
+	 * Widen to ktime_t before multiplying so that the product is not
+	 * truncated to the width of "u32 * NSEC_PER_MSEC" first.
+	 */
+	ktime_t baset_ns = (ktime_t)baset_ms * NSEC_PER_MSEC;
+	u32 fuzzt_ns;
+
+	/* Saturate at the u32 maximum if the fuzz does not fit in nanoseconds. */
+	if ((u32)~0U / NSEC_PER_USEC < fuzzt_us)
+		fuzzt_ns = (u32)~0U;
+	else
+		fuzzt_ns = fuzzt_us * NSEC_PER_USEC;
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_ms);
+
+/*
+ * Sleep on a high-resolution timer for @baset_j jiffies, using one jiffy
+ * (converted to nanoseconds) as the bound on the random fuzz.  Intended
+ * as a replacement for calls to schedule_timeout_interruptible() and
+ * friends.  Returns the value returned by torture_hrtimeout_ns().
+ */
+int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp)
+{
+	return torture_hrtimeout_ns(jiffies_to_nsecs(baset_j),
+				    jiffies_to_nsecs(1), trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies);
+
+/*
+ * Schedule a high-resolution-timer sleep in seconds, with a 32-bit
+ * millisecond (not second!) random fuzz.
+ *
+ * @baset_s: base sleep time, in seconds.
+ * @fuzzt_ms: bound on the random fuzz, in milliseconds.
+ * @trsp: random-number state, or NULL for no fuzz.
+ *
+ * Returns the value returned by torture_hrtimeout_ns().
+ */
+int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp)
+{
+	/*
+	 * Widen to ktime_t before multiplying so that the product is not
+	 * truncated to the width of "u32 * NSEC_PER_SEC" first.
+	 */
+	ktime_t baset_ns = (ktime_t)baset_s * NSEC_PER_SEC;
+	u32 fuzzt_ns;
+
+	/* Saturate at the u32 maximum if the fuzz does not fit in nanoseconds. */
+	if ((u32)~0U / NSEC_PER_MSEC < fuzzt_ms)
+		fuzzt_ns = (u32)~0U;
+	else
+		fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC;
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_s);
+
#ifdef CONFIG_HOTPLUG_CPU
/*
@@ -80,6 +175,19 @@ static unsigned long sum_online;
static int min_online = -1;
static int max_online;
+static int torture_online_cpus = NR_CPUS;
+
+/*
+ * Return the torture-test framework's own count of online CPUs.  Some
+ * torture tests leverage confusion about how many CPUs are online, so
+ * this view is what lets them load-balance appropriately.
+ */
+int torture_num_online_cpus(void)
+{
+	int ncpus = READ_ONCE(torture_online_cpus);
+
+	return ncpus;
+}
+EXPORT_SYMBOL_GPL(torture_num_online_cpus);
+
/*
* Attempt to take a CPU offline. Return false if the CPU is already
* offline or if it is not subject to CPU-hotplug operations. The
@@ -134,6 +242,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
*min_offl = delta;
if (*max_offl < delta)
*max_offl = delta;
+ WRITE_ONCE(torture_online_cpus, torture_online_cpus - 1);
+ WARN_ON_ONCE(torture_online_cpus <= 0);
}
return true;
@@ -190,6 +300,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
*min_onl = delta;
if (*max_onl < delta)
*max_onl = delta;
+ WRITE_ONCE(torture_online_cpus, torture_online_cpus + 1);
}
return true;
@@ -197,6 +308,26 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
EXPORT_SYMBOL_GPL(torture_online);
/*
+ * Get everything online at the beginning and end of tests.
+ *
+ * Tries add_cpu() on every possible CPU that is not currently online.
+ * @phase labels the point in the test (the callers in this file pass
+ * "Initial" and "Final") and is used only in the message printed when
+ * add_cpu() fails and verbose output is enabled.
+ */
+static void torture_online_all(char *phase)
+{
+	int cpu;
+	int ret;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu_online(cpu))
+			continue;
+		ret = add_cpu(cpu);
+		if (ret && verbose) {
+			/* Torture type first, then phase: "<type>: <phase> online ...". */
+			pr_alert("%s" TORTURE_FLAG
+				 "%s: %s online %d: errno %d\n",
+				 __func__, torture_type, phase, cpu, ret);
+		}
+	}
+}
+
+/*
* Execute random CPU-hotplug operations at the interval specified
* by the onoff_interval.
*/
@@ -206,25 +337,12 @@ torture_onoff(void *arg)
int cpu;
int maxcpu = -1;
DEFINE_TORTURE_RANDOM(rand);
- int ret;
VERBOSE_TOROUT_STRING("torture_onoff task started");
for_each_online_cpu(cpu)
maxcpu = cpu;
WARN_ON(maxcpu < 0);
- if (!IS_MODULE(CONFIG_TORTURE_TEST)) {
- for_each_possible_cpu(cpu) {
- if (cpu_online(cpu))
- continue;
- ret = add_cpu(cpu);
- if (ret && verbose) {
- pr_alert("%s" TORTURE_FLAG
- "%s: Initial online %d: errno %d\n",
- __func__, torture_type, cpu, ret);
- }
- }
- }
-
+ torture_online_all("Initial");
if (maxcpu == 0) {
VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
goto stop;
@@ -252,6 +370,7 @@ torture_onoff(void *arg)
stop:
torture_kthread_stopping("torture_onoff");
+ torture_online_all("Final");
return 0;
}
@@ -602,7 +721,6 @@ static int stutter_gap;
*/
bool stutter_wait(const char *title)
{
- ktime_t delay;
unsigned int i = 0;
bool ret = false;
int spt;
@@ -618,11 +736,8 @@ bool stutter_wait(const char *title)
schedule_timeout_interruptible(1);
} else if (spt == 2) {
while (READ_ONCE(stutter_pause_test)) {
- if (!(i++ & 0xffff)) {
- set_current_state(TASK_INTERRUPTIBLE);
- delay = 10 * NSEC_PER_USEC;
- schedule_hrtimeout(&delay, HRTIMER_MODE_REL);
- }
+ if (!(i++ & 0xffff))
+ torture_hrtimeout_us(10, 0, NULL);
cond_resched();
}
} else {
@@ -640,7 +755,6 @@ EXPORT_SYMBOL_GPL(stutter_wait);
*/
static int torture_stutter(void *arg)
{
- ktime_t delay;
DEFINE_TORTURE_RANDOM(rand);
int wtime;
@@ -651,20 +765,15 @@ static int torture_stutter(void *arg)
if (stutter > 2) {
WRITE_ONCE(stutter_pause_test, 1);
wtime = stutter - 3;
- delay = ktime_divns(NSEC_PER_SEC * wtime, HZ);
- delay += (torture_random(&rand) >> 3) % NSEC_PER_MSEC;
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_hrtimeout(&delay, HRTIMER_MODE_REL);
+ torture_hrtimeout_jiffies(wtime, &rand);
wtime = 2;
}
WRITE_ONCE(stutter_pause_test, 2);
- delay = ktime_divns(NSEC_PER_SEC * wtime, HZ);
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_hrtimeout(&delay, HRTIMER_MODE_REL);
+ torture_hrtimeout_jiffies(wtime, NULL);
}
WRITE_ONCE(stutter_pause_test, 0);
if (!torture_must_stop())
- schedule_timeout_interruptible(stutter_gap);
+ torture_hrtimeout_jiffies(stutter_gap, NULL);
torture_shutdown_absorb("torture_stutter");
} while (!torture_must_stop());
torture_kthread_stopping("torture_stutter");
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 9cf4d12b81fbf..1558be8680a6b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1334,6 +1334,7 @@ config LOCKDEP_SMALL
config DEBUG_LOCKDEP
bool "Lock dependency engine debugging"
depends on DEBUG_KERNEL && LOCKDEP
+ select DEBUG_IRQFLAGS
help
If you say Y here, the lock dependency engine will do
additional runtime checks to debug itself, at the price
@@ -1422,6 +1423,13 @@ config TRACE_IRQFLAGS_NMI
depends on TRACE_IRQFLAGS
depends on TRACE_IRQFLAGS_NMI_SUPPORT
+config DEBUG_IRQFLAGS
+ bool "Debug IRQ flag manipulation"
+ help
+ Enables checks for potentially unsafe enabling or disabling of
+ interrupts, such as calling raw_local_irq_restore() when interrupts
+ are enabled.
+
config STACKTRACE
bool "Stack backtrace support"
depends on STACKTRACE_SUPPORT
diff --git a/lib/Makefile b/lib/Makefile
index a6b160c3a4fac..fb7d946bb8c3e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -27,9 +27,6 @@ KASAN_SANITIZE_string.o := n
CFLAGS_string.o += -fno-stack-protector
endif
-# Used by KCSAN while enabled, avoid recursion.
-KCSAN_SANITIZE_random32.o := n
-
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o timerqueue.o xarray.o \
idr.o extable.o sha1.o irq_regs.o argv_split.o \
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 9959ea23529e8..2d85abac17448 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -24,6 +24,7 @@
#include <linux/debug_locks.h>
#include <linux/irqflags.h>
#include <linux/rtmutex.h>
+#include <linux/local_lock.h>
/*
* Change this to 1 if you want to see the failure printouts:
@@ -51,6 +52,7 @@ __setup("debug_locks_verbose=", setup_debug_locks_verbose);
#define LOCKTYPE_RWSEM 0x8
#define LOCKTYPE_WW 0x10
#define LOCKTYPE_RTMUTEX 0x20
+#define LOCKTYPE_LL 0x40
static struct ww_acquire_ctx t, t2;
static struct ww_mutex o, o2, o3;
@@ -64,6 +66,9 @@ static DEFINE_SPINLOCK(lock_B);
static DEFINE_SPINLOCK(lock_C);
static DEFINE_SPINLOCK(lock_D);
+static DEFINE_RAW_SPINLOCK(raw_lock_A);
+static DEFINE_RAW_SPINLOCK(raw_lock_B);
+
static DEFINE_RWLOCK(rwlock_A);
static DEFINE_RWLOCK(rwlock_B);
static DEFINE_RWLOCK(rwlock_C);
@@ -133,6 +138,8 @@ static DEFINE_RT_MUTEX(rtmutex_Z2);
#endif
+static local_lock_t local_A = INIT_LOCAL_LOCK(local_A);
+
/*
* non-inlined runtime initializers, to let separate locks share
* the same lock-class:
@@ -1306,19 +1313,23 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock)
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map)
+# define I_RAW_SPINLOCK(x) lockdep_reset_lock(&raw_lock_##x.dep_map)
# define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map)
# define I_MUTEX(x) lockdep_reset_lock(&mutex_##x.dep_map)
# define I_RWSEM(x) lockdep_reset_lock(&rwsem_##x.dep_map)
# define I_WW(x) lockdep_reset_lock(&x.dep_map)
+# define I_LOCAL_LOCK(x) lockdep_reset_lock(&local_##x.dep_map)
#ifdef CONFIG_RT_MUTEXES
# define I_RTMUTEX(x) lockdep_reset_lock(&rtmutex_##x.dep_map)
#endif
#else
# define I_SPINLOCK(x)
+# define I_RAW_SPINLOCK(x)
# define I_RWLOCK(x)
# define I_MUTEX(x)
# define I_RWSEM(x)
# define I_WW(x)
+# define I_LOCAL_LOCK(x)
#endif
#ifndef I_RTMUTEX
@@ -1358,9 +1369,16 @@ static void reset_locks(void)
I1(A); I1(B); I1(C); I1(D);
I1(X1); I1(X2); I1(Y1); I1(Y2); I1(Z1); I1(Z2);
I_WW(t); I_WW(t2); I_WW(o.base); I_WW(o2.base); I_WW(o3.base);
+ I_RAW_SPINLOCK(A); I_RAW_SPINLOCK(B);
+ I_LOCAL_LOCK(A);
+
lockdep_reset();
+
I2(A); I2(B); I2(C); I2(D);
init_shared_classes();
+ raw_spin_lock_init(&raw_lock_A);
+ raw_spin_lock_init(&raw_lock_B);
+ local_lock_init(&local_A);
ww_mutex_init(&o, &ww_lockdep); ww_mutex_init(&o2, &ww_lockdep); ww_mutex_init(&o3, &ww_lockdep);
memset(&t, 0, sizeof(t)); memset(&t2, 0, sizeof(t2));
@@ -1382,6 +1400,8 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
WARN_ON(irqs_disabled());
+ debug_locks_silent = !(debug_locks_verbose & lockclass_mask);
+
testcase_fn();
/*
* Filter out expected failures:
@@ -1402,7 +1422,7 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
}
testcase_total++;
- if (debug_locks_verbose)
+ if (debug_locks_verbose & lockclass_mask)
pr_cont(" lockclass mask: %x, debug_locks: %d, expected: %d\n",
lockclass_mask, debug_locks, expected);
/*
@@ -2419,6 +2439,311 @@ static void fs_reclaim_tests(void)
pr_cont("\n");
}
+#define __guard(cleanup) __maybe_unused __attribute__((__cleanup__(cleanup)))
+
+static void hardirq_exit(int *_)
+{
+ HARDIRQ_EXIT();
+}
+
+#define HARDIRQ_CONTEXT(name, ...) \
+ int hardirq_guard_##name __guard(hardirq_exit); \
+ HARDIRQ_ENTER();
+
+#define NOTTHREADED_HARDIRQ_CONTEXT(name, ...) \
+ int notthreaded_hardirq_guard_##name __guard(hardirq_exit); \
+ local_irq_disable(); \
+ __irq_enter(); \
+ WARN_ON(!in_irq());
+
+static void softirq_exit(int *_)
+{
+ SOFTIRQ_EXIT();
+}
+
+#define SOFTIRQ_CONTEXT(name, ...) \
+ int softirq_guard_##name __guard(softirq_exit); \
+ SOFTIRQ_ENTER();
+
+static void rcu_exit(int *_)
+{
+ rcu_read_unlock();
+}
+
+#define RCU_CONTEXT(name, ...) \
+ int rcu_guard_##name __guard(rcu_exit); \
+ rcu_read_lock();
+
+static void rcu_bh_exit(int *_)
+{
+ rcu_read_unlock_bh();
+}
+
+#define RCU_BH_CONTEXT(name, ...) \
+ int rcu_bh_guard_##name __guard(rcu_bh_exit); \
+ rcu_read_lock_bh();
+
+static void rcu_sched_exit(int *_)
+{
+ rcu_read_unlock_sched();
+}
+
+#define RCU_SCHED_CONTEXT(name, ...) \
+ int rcu_sched_guard_##name __guard(rcu_sched_exit); \
+ rcu_read_lock_sched();
+
+static void rcu_callback_exit(int *_)
+{
+ rcu_lock_release(&rcu_callback_map);
+}
+
+#define RCU_CALLBACK_CONTEXT(name, ...) \
+ int rcu_callback_guard_##name __guard(rcu_callback_exit); \
+ rcu_lock_acquire(&rcu_callback_map);
+
+
+static void raw_spinlock_exit(raw_spinlock_t **lock)
+{
+ raw_spin_unlock(*lock);
+}
+
+#define RAW_SPINLOCK_CONTEXT(name, lock) \
+ raw_spinlock_t *raw_spinlock_guard_##name __guard(raw_spinlock_exit) = &(lock); \
+ raw_spin_lock(&(lock));
+
+static void spinlock_exit(spinlock_t **lock)
+{
+ spin_unlock(*lock);
+}
+
+#define SPINLOCK_CONTEXT(name, lock) \
+ spinlock_t *spinlock_guard_##name __guard(spinlock_exit) = &(lock); \
+ spin_lock(&(lock));
+
+static void mutex_exit(struct mutex **lock)
+{
+ mutex_unlock(*lock);
+}
+
+#define MUTEX_CONTEXT(name, lock) \
+ struct mutex *mutex_guard_##name __guard(mutex_exit) = &(lock); \
+ mutex_lock(&(lock));
+
+#define GENERATE_2_CONTEXT_TESTCASE(outer, outer_lock, inner, inner_lock) \
+ \
+static void __maybe_unused inner##_in_##outer(void) \
+{ \
+ outer##_CONTEXT(_, outer_lock); \
+ { \
+ inner##_CONTEXT(_, inner_lock); \
+ } \
+}
+
+/*
+ * wait contexts (considering PREEMPT_RT)
+ *
+ * o: inner is allowed in outer
+ * x: inner is disallowed in outer
+ *
+ * \ inner | RCU | RAW_SPIN | SPIN | MUTEX
+ * outer \ | | | |
+ * ---------------+-------+----------+------+-------
+ * HARDIRQ | o | o | o | x
+ * ---------------+-------+----------+------+-------
+ * NOTTHREADED_IRQ| o | o | x | x
+ * ---------------+-------+----------+------+-------
+ * SOFTIRQ | o | o | o | x
+ * ---------------+-------+----------+------+-------
+ * RCU | o | o | o | x
+ * ---------------+-------+----------+------+-------
+ * RCU_BH | o | o | o | x
+ * ---------------+-------+----------+------+-------
+ * RCU_CALLBACK | o | o | o | x
+ * ---------------+-------+----------+------+-------
+ * RCU_SCHED | o | o | x | x
+ * ---------------+-------+----------+------+-------
+ * RAW_SPIN | o | o | x | x
+ * ---------------+-------+----------+------+-------
+ * SPIN | o | o | o | x
+ * ---------------+-------+----------+------+-------
+ * MUTEX | o | o | o | o
+ * ---------------+-------+----------+------+-------
+ */
+
+#define GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(HARDIRQ, , inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(NOTTHREADED_HARDIRQ, , inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(SOFTIRQ, , inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(RCU, , inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(RCU_BH, , inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(RCU_CALLBACK, , inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(RCU_SCHED, , inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(RAW_SPINLOCK, raw_lock_A, inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(SPINLOCK, lock_A, inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(MUTEX, mutex_A, inner, inner_lock)
+
+GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(RCU, )
+GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(RAW_SPINLOCK, raw_lock_B)
+GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(SPINLOCK, lock_B)
+GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(MUTEX, mutex_B)
+
+/* the outer context allows all kinds of preemption */
+#define DO_CONTEXT_TESTCASE_OUTER_PREEMPTIBLE(outer) \
+ dotest(RCU_in_##outer, SUCCESS, LOCKTYPE_RWLOCK); \
+ dotest(RAW_SPINLOCK_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
+ dotest(SPINLOCK_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
+ dotest(MUTEX_in_##outer, SUCCESS, LOCKTYPE_MUTEX); \
+
+/*
+ * the outer context only allows the preemption introduced by spinlock_t (which
+ * is a sleepable lock for PREEMPT_RT)
+ */
+#define DO_CONTEXT_TESTCASE_OUTER_LIMITED_PREEMPTIBLE(outer) \
+ dotest(RCU_in_##outer, SUCCESS, LOCKTYPE_RWLOCK); \
+ dotest(RAW_SPINLOCK_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
+ dotest(SPINLOCK_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
+ dotest(MUTEX_in_##outer, FAILURE, LOCKTYPE_MUTEX); \
+
+/* the outer context doesn't allow any kind of preemption */
+#define DO_CONTEXT_TESTCASE_OUTER_NOT_PREEMPTIBLE(outer) \
+ dotest(RCU_in_##outer, SUCCESS, LOCKTYPE_RWLOCK); \
+ dotest(RAW_SPINLOCK_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
+ dotest(SPINLOCK_in_##outer, FAILURE, LOCKTYPE_SPIN); \
+ dotest(MUTEX_in_##outer, FAILURE, LOCKTYPE_MUTEX); \
+
+static void wait_context_tests(void)
+{
+ printk(" --------------------------------------------------------------------------\n");
+ printk(" | wait context tests |\n");
+ printk(" --------------------------------------------------------------------------\n");
+ printk(" | rcu | raw | spin |mutex |\n");
+ printk(" --------------------------------------------------------------------------\n");
+ print_testname("in hardirq context");
+ DO_CONTEXT_TESTCASE_OUTER_LIMITED_PREEMPTIBLE(HARDIRQ);
+ pr_cont("\n");
+
+ print_testname("in hardirq context (not threaded)");
+ DO_CONTEXT_TESTCASE_OUTER_NOT_PREEMPTIBLE(NOTTHREADED_HARDIRQ);
+ pr_cont("\n");
+
+ print_testname("in softirq context");
+ DO_CONTEXT_TESTCASE_OUTER_LIMITED_PREEMPTIBLE(SOFTIRQ);
+ pr_cont("\n");
+
+ print_testname("in RCU context");
+ DO_CONTEXT_TESTCASE_OUTER_LIMITED_PREEMPTIBLE(RCU);
+ pr_cont("\n");
+
+ print_testname("in RCU-bh context");
+ DO_CONTEXT_TESTCASE_OUTER_LIMITED_PREEMPTIBLE(RCU_BH);
+ pr_cont("\n");
+
+ print_testname("in RCU callback context");
+ DO_CONTEXT_TESTCASE_OUTER_LIMITED_PREEMPTIBLE(RCU_CALLBACK);
+ pr_cont("\n");
+
+ print_testname("in RCU-sched context");
+ DO_CONTEXT_TESTCASE_OUTER_NOT_PREEMPTIBLE(RCU_SCHED);
+ pr_cont("\n");
+
+ print_testname("in RAW_SPINLOCK context");
+ DO_CONTEXT_TESTCASE_OUTER_NOT_PREEMPTIBLE(RAW_SPINLOCK);
+ pr_cont("\n");
+
+ print_testname("in SPINLOCK context");
+ DO_CONTEXT_TESTCASE_OUTER_LIMITED_PREEMPTIBLE(SPINLOCK);
+ pr_cont("\n");
+
+ print_testname("in MUTEX context");
+ DO_CONTEXT_TESTCASE_OUTER_PREEMPTIBLE(MUTEX);
+ pr_cont("\n");
+}
+
+/*
+ * local_lock inversion scenario 2.
+ *
+ * Three steps: take and release local_A with interrupts on; take and
+ * release lock_A inside a simulated hardirq; then, with hardirqs disabled,
+ * take local_A while holding lock_A.  The caller in this file runs this
+ * scenario with an expected result of SUCCESS, matching the "false"
+ * annotation on the last acquisition.
+ */
+static void local_lock_2(void)
+{
+ local_lock_acquire(&local_A); /* IRQ-ON */
+ local_lock_release(&local_A);
+
+ HARDIRQ_ENTER();
+ spin_lock(&lock_A); /* IN-IRQ */
+ spin_unlock(&lock_A);
+ HARDIRQ_EXIT()
+
+ HARDIRQ_DISABLE();
+ spin_lock(&lock_A);
+ local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */
+ local_lock_release(&local_A);
+ spin_unlock(&lock_A);
+ HARDIRQ_ENABLE();
+}
+
+/*
+ * local_lock inversion scenario 3A.
+ *
+ * Like scenario 2, except that lock_B is additionally taken and released
+ * while local_A is held with interrupts on.  lock_A is then used inside a
+ * simulated hardirq, and finally local_A is taken under lock_A with
+ * hardirqs disabled.  The caller in this file runs this scenario with an
+ * expected result of SUCCESS: per the annotation below, a cycle exists
+ * only if local_lock() is counted.
+ */
+static void local_lock_3A(void)
+{
+ local_lock_acquire(&local_A); /* IRQ-ON */
+ spin_lock(&lock_B); /* IRQ-ON */
+ spin_unlock(&lock_B);
+ local_lock_release(&local_A);
+
+ HARDIRQ_ENTER();
+ spin_lock(&lock_A); /* IN-IRQ */
+ spin_unlock(&lock_A);
+ HARDIRQ_EXIT()
+
+ HARDIRQ_DISABLE();
+ spin_lock(&lock_A);
+ local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */
+ local_lock_release(&local_A);
+ spin_unlock(&lock_A);
+ HARDIRQ_ENABLE();
+}
+
+/*
+ * local_lock inversion scenario 3B.
+ *
+ * Repeats the steps of scenario 3A and then, with hardirqs disabled,
+ * takes lock_B while holding lock_A.  Because lock_B was earlier taken
+ * with interrupts on and lock_A is taken in hardirq context, this last
+ * step is annotated as a true IN-IRQ <-> IRQ-ON cycle; the caller in this
+ * file runs the scenario with an expected result of FAILURE.
+ */
+static void local_lock_3B(void)
+{
+	local_lock_acquire(&local_A);	/* IRQ-ON */
+	spin_lock(&lock_B);		/* IRQ-ON */
+	spin_unlock(&lock_B);
+	local_lock_release(&local_A);
+
+	HARDIRQ_ENTER();
+	spin_lock(&lock_A);		/* IN-IRQ */
+	spin_unlock(&lock_A);
+	HARDIRQ_EXIT()
+
+	HARDIRQ_DISABLE();
+	spin_lock(&lock_A);
+	local_lock_acquire(&local_A);	/* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */
+	local_lock_release(&local_A);
+	spin_unlock(&lock_A);
+	HARDIRQ_ENABLE();
+
+	HARDIRQ_DISABLE();
+	spin_lock(&lock_A);
+	spin_lock(&lock_B);		/* IN-IRQ <-> IRQ-ON cycle, true */
+	spin_unlock(&lock_B);
+	spin_unlock(&lock_A);
+	/* Pair the HARDIRQ_DISABLE() above, as every other section here does. */
+	HARDIRQ_ENABLE();
+}
+
+static void local_lock_tests(void)
+{
+ printk(" --------------------------------------------------------------------------\n");
+ printk(" | local_lock tests |\n");
+ printk(" ---------------------\n");
+
+ print_testname("local_lock inversion 2");
+ dotest(local_lock_2, SUCCESS, LOCKTYPE_LL);
+ pr_cont("\n");
+
+ print_testname("local_lock inversion 3A");
+ dotest(local_lock_3A, SUCCESS, LOCKTYPE_LL);
+ pr_cont("\n");
+
+ print_testname("local_lock inversion 3B");
+ dotest(local_lock_3B, FAILURE, LOCKTYPE_LL);
+ pr_cont("\n");
+}
+
void locking_selftest(void)
{
/*
@@ -2446,7 +2771,6 @@ void locking_selftest(void)
printk(" --------------------------------------------------------------------------\n");
init_shared_classes();
- debug_locks_silent = !debug_locks_verbose;
lockdep_set_selftest_task(current);
DO_TESTCASE_6R("A-A deadlock", AA);
@@ -2542,6 +2866,12 @@ void locking_selftest(void)
fs_reclaim_tests();
+ /* Wait context test cases that are specific for RAW_LOCK_NESTING */
+ if (IS_ENABLED(CONFIG_PROVE_RAW_LOCK_NESTING))
+ wait_context_tests();
+
+ local_lock_tests();
+
if (unexpected_testcase_failures) {
printk("-----------------------------------------------------------------\n");
debug_locks = 0;
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index e59eda07305e6..a1071cdefb5aa 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include <linux/percpu-refcount.h>
/*
@@ -168,6 +169,7 @@ static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
struct percpu_ref_data, rcu);
struct percpu_ref *ref = data->ref;
unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
+ static atomic_t underflows;
unsigned long count = 0;
int cpu;
@@ -191,9 +193,13 @@ static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
*/
atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count);
- WARN_ONCE(atomic_long_read(&data->count) <= 0,
- "percpu ref (%ps) <= 0 (%ld) after switching to atomic",
- data->release, atomic_long_read(&data->count));
+ if (WARN_ONCE(atomic_long_read(&data->count) <= 0,
+ "percpu ref (%ps) <= 0 (%ld) after switching to atomic",
+ data->release, atomic_long_read(&data->count)) &&
+ atomic_inc_return(&underflows) < 4) {
+ pr_err("%s(): percpu_ref underflow", __func__);
+ mem_dump_obj(data);
+ }
/* @ref is viewed as dead on all CPUs, send out switch confirmation */
percpu_ref_call_confirm_rcu(rcu);
diff --git a/lib/test_fpu.c b/lib/test_fpu.c
index c33764aa3eb8f..e82db19fed84a 100644
--- a/lib/test_fpu.c
+++ b/lib/test_fpu.c
@@ -63,7 +63,7 @@ static int test_fpu_get(void *data, u64 *val)
return status;
}
-DEFINE_SIMPLE_ATTRIBUTE(test_fpu_fops, test_fpu_get, NULL, "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(test_fpu_fops, test_fpu_get, NULL, "%lld\n");
static struct dentry *selftest_dir;
static int __init test_fpu_init(void)
@@ -72,8 +72,8 @@ static int __init test_fpu_init(void)
if (!selftest_dir)
return -ENOMEM;
- debugfs_create_file("test_fpu", 0444, selftest_dir, NULL,
- &test_fpu_fops);
+ debugfs_create_file_unsafe("test_fpu", 0444, selftest_dir, NULL,
+ &test_fpu_fops);
return 0;
}
diff --git a/lib/timerqueue.c b/lib/timerqueue.c
index c527109645930..cdb9c7658478f 100644
--- a/lib/timerqueue.c
+++ b/lib/timerqueue.c
@@ -14,6 +14,14 @@
#include <linux/rbtree.h>
#include <linux/export.h>
+#define __node_2_tq(_n) \
+ rb_entry((_n), struct timerqueue_node, node)
+
+/* Ordering predicate: true when @a's timerqueue node expires strictly before @b's. */
+static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b)
+{
+	const struct timerqueue_node *lhs = __node_2_tq(a);
+	const struct timerqueue_node *rhs = __node_2_tq(b);
+
+	return lhs->expires < rhs->expires;
+}
+
/**
* timerqueue_add - Adds timer to timerqueue.
*
@@ -26,28 +34,10 @@
*/
bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
{
- struct rb_node **p = &head->rb_root.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct timerqueue_node *ptr;
- bool leftmost = true;
-
/* Make sure we don't add nodes that are already added */
WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));
- while (*p) {
- parent = *p;
- ptr = rb_entry(parent, struct timerqueue_node, node);
- if (node->expires < ptr->expires) {
- p = &(*p)->rb_left;
- } else {
- p = &(*p)->rb_right;
- leftmost = false;
- }
- }
- rb_link_node(&node->node, parent, p);
- rb_insert_color_cached(&node->node, &head->rb_root, leftmost);
-
- return leftmost;
+ return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less);
}
EXPORT_SYMBOL_GPL(timerqueue_add);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4bdb58ab14cbb..905a7d549b00f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4008,25 +4008,11 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
- struct mm_struct *mm;
struct mmu_gather tlb;
- unsigned long tlb_start = start;
- unsigned long tlb_end = end;
- /*
- * If shared PMDs were possibly used within this vma range, adjust
- * start/end for worst case tlb flushing.
- * Note that we can not be sure if PMDs are shared until we try to
- * unmap pages. However, we want to make sure TLB flushing covers
- * the largest possible range.
- */
- adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
-
- mm = vma->vm_mm;
-
- tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
+ tlb_gather_mmu(&tlb, vma->vm_mm);
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
- tlb_finish_mmu(&tlb, tlb_start, tlb_end);
+ tlb_finish_mmu(&tlb);
}
/*
diff --git a/mm/madvise.c b/mm/madvise.c
index d4f5eece9d56b..df692d2e35d4a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -506,9 +506,9 @@ static long madvise_cold(struct vm_area_struct *vma,
return -EINVAL;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, mm);
madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
- tlb_finish_mmu(&tlb, start_addr, end_addr);
+ tlb_finish_mmu(&tlb);
return 0;
}
@@ -559,9 +559,9 @@ static long madvise_pageout(struct vm_area_struct *vma,
return 0;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, mm);
madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
- tlb_finish_mmu(&tlb, start_addr, end_addr);
+ tlb_finish_mmu(&tlb);
return 0;
}
@@ -724,7 +724,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
@@ -733,7 +733,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
&madvise_free_walk_ops, &tlb);
tlb_end_vma(&tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, range.start, range.end);
+ tlb_finish_mmu(&tlb);
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index 7b13078733252..b32f32bf584de 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1546,13 +1546,13 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
start, start + size);
- tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
+ tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
unmap_single_vma(&tlb, vma, start, range.end, NULL);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, start, range.end);
+ tlb_finish_mmu(&tlb);
}
/**
@@ -1573,12 +1573,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address, address + size);
- tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
+ tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
unmap_single_vma(&tlb, vma, address, range.end, details);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, address, range.end);
+ tlb_finish_mmu(&tlb);
}
/**
diff --git a/mm/mmap.c b/mm/mmap.c
index dc7206032387c..90673febce6aa 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2671,12 +2671,12 @@ static void unmap_region(struct mm_struct *mm,
struct mmu_gather tlb;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start, end);
+ tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
- tlb_finish_mmu(&tlb, start, end);
+ tlb_finish_mmu(&tlb);
}
/*
@@ -3214,12 +3214,12 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
- tlb_gather_mmu(&tlb, mm, 0, -1);
+ tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
- tlb_finish_mmu(&tlb, 0, -1);
+ tlb_finish_mmu(&tlb);
/*
* Walk the list again, actually closing and freeing it,
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 03c33c93a582b..0dc7149b0c615 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -253,21 +253,17 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
* tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
* @tlb: the mmu_gather structure to initialize
* @mm: the mm_struct of the target address space
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
+ * @fullmm: @mm is without users and we're going to destroy the full address
+ * space (exit/execve)
*
* Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @start and @end are set to 0 and -1
- * respectively when @mm is without users and we're going to destroy
- * the full address space (exit/execve).
+ * tear-down from @mm.
*/
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
+static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ bool fullmm)
{
tlb->mm = mm;
-
- /* Is it from 0 to ~0? */
- tlb->fullmm = !(start | (end+1));
+ tlb->fullmm = fullmm;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb->need_flush_all = 0;
@@ -287,17 +283,24 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
inc_tlb_flush_pending(tlb->mm);
}
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
+{
+ __tlb_gather_mmu(tlb, mm, false);
+}
+
+void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
+{
+ __tlb_gather_mmu(tlb, mm, true);
+}
+
/**
* tlb_finish_mmu - finish an mmu_gather structure
* @tlb: the mmu_gather structure to finish
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
*
* Called at the end of the shootdown operation to free up any resources that
* were required.
*/
-void tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end)
+void tlb_finish_mmu(struct mmu_gather *tlb)
{
/*
* If there are parallel threads are doing PTE changes on same range
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 04b19b7b5435b..c9a33ffe38b7c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -546,15 +546,15 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
vma, mm, vma->vm_start,
vma->vm_end);
- tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
- tlb_finish_mmu(&tlb, range.start, range.end);
+ tlb_finish_mmu(&tlb);
ret = false;
continue;
}
unmap_page_range(&tlb, vma, range.start, range.end, NULL);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, range.start, range.end);
+ tlb_finish_mmu(&tlb);
}
}
diff --git a/mm/slab.c b/mm/slab.c
index d7c8da9319c78..dcc55e78f3534 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3635,6 +3635,26 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif /* CONFIG_NUMA */
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+{
+ struct kmem_cache *cachep;
+ unsigned int objnr;
+ void *objp;
+
+ kpp->kp_ptr = object;
+ kpp->kp_page = page;
+ cachep = page->slab_cache;
+ kpp->kp_slab_cache = cachep;
+ objp = object - obj_offset(cachep);
+ kpp->kp_data_offset = obj_offset(cachep);
+ page = virt_to_head_page(objp);
+ objnr = obj_to_index(cachep, page, objp);
+ objp = index_to_obj(cachep, page, objnr);
+ kpp->kp_objp = objp;
+ if (DEBUG && cachep->flags & SLAB_STORE_USER)
+ kpp->kp_ret = *dbg_userword(cachep, objp);
+}
+
/**
* __do_kmalloc - allocate memory
* @size: how many bytes of memory are required.
diff --git a/mm/slab.h b/mm/slab.h
index 1a756a359fa8b..ecad9b57bc441 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -615,4 +615,16 @@ static inline bool slab_want_init_on_free(struct kmem_cache *c)
return false;
}
+#define KS_ADDRS_COUNT 16
+struct kmem_obj_info {
+ void *kp_ptr;
+ struct page *kp_page;
+ void *kp_objp;
+ unsigned long kp_data_offset;
+ struct kmem_cache *kp_slab_cache;
+ void *kp_ret;
+ void *kp_stack[KS_ADDRS_COUNT];
+};
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
+
#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e981c80d216c2..adbace4256efb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -537,6 +537,81 @@ bool slab_is_available(void)
return slab_state >= UP;
}
+/**
+ * kmem_valid_obj - does the pointer reference a valid slab object?
+ * @object: pointer to query.
+ *
+ * Return: %true if the pointer is to a not-yet-freed object from
+ * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
+ * is to an already-freed object, and %false otherwise.
+ */
+bool kmem_valid_obj(void *object)
+{
+ struct page *page;
+
+ /* Some arches consider ZERO_SIZE_PTR to be a valid address. */
+ if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
+ return false;
+ page = virt_to_head_page(object);
+ return PageSlab(page);
+}
+
+/**
+ * kmem_dump_obj - Print available slab provenance information
+ * @object: slab object for which to find provenance information.
+ *
+ * This function uses pr_cont(), so the caller is expected to have already
+ * printed out whatever preamble is appropriate. The provenance information
+ * depends on the type of object and on how much debugging is enabled.
+ * For a slab-cache object, the fact that it is a slab object is printed,
+ * and, if available, the slab name, return address, and stack trace from
+ * the allocation of that object.
+ *
+ * This function will splat if passed a pointer to a non-slab object.
+ * If you are not sure what type of object you have, you should instead
+ * use mem_dump_obj().
+ */
+void kmem_dump_obj(void *object)
+{
+ char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
+ int i;
+ struct page *page;
+ unsigned long ptroffset;
+ struct kmem_obj_info kp = { };
+
+ if (WARN_ON_ONCE(!virt_addr_valid(object)))
+ return;
+ page = virt_to_head_page(object);
+ if (WARN_ON_ONCE(!PageSlab(page))) {
+ pr_cont(" non-slab memory.\n");
+ return;
+ }
+ kmem_obj_info(&kp, object, page);
+ if (kp.kp_slab_cache)
+ pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
+ else
+ pr_cont(" slab%s", cp);
+ if (kp.kp_objp)
+ pr_cont(" start %px", kp.kp_objp);
+ if (kp.kp_data_offset)
+ pr_cont(" data offset %lu", kp.kp_data_offset);
+ if (kp.kp_objp) {
+ ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
+ pr_cont(" pointer offset %lu", ptroffset);
+ }
+ if (kp.kp_slab_cache && kp.kp_slab_cache->usersize)
+ pr_cont(" size %u", kp.kp_slab_cache->usersize);
+ if (kp.kp_ret)
+ pr_cont(" allocated at %pS\n", kp.kp_ret);
+ else
+ pr_cont("\n");
+ for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
+ if (!kp.kp_stack[i])
+ break;
+ pr_info(" %pS\n", kp.kp_stack[i]);
+ }
+}
+
#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name,
diff --git a/mm/slob.c b/mm/slob.c
index 8d4bfa46247f4..ef87ada8705d8 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -461,6 +461,12 @@ out:
spin_unlock_irqrestore(&slob_lock, flags);
}
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+{
+ kpp->kp_ptr = object;
+ kpp->kp_page = page;
+}
+
/*
* End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
*/
diff --git a/mm/slub.c b/mm/slub.c
index b22a4b101c846..f5baf429654f2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3933,6 +3933,46 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
return 0;
}
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+{
+ void *base;
+ int __maybe_unused i;
+ unsigned int objnr;
+ void *objp;
+ void *objp0;
+ struct kmem_cache *s = page->slab_cache;
+ struct track __maybe_unused *trackp;
+
+ kpp->kp_ptr = object;
+ kpp->kp_page = page;
+ kpp->kp_slab_cache = s;
+ base = page_address(page);
+ objp0 = kasan_reset_tag(object);
+#ifdef CONFIG_SLUB_DEBUG
+ objp = restore_red_left(s, objp0);
+#else
+ objp = objp0;
+#endif
+ objnr = obj_to_index(s, page, objp);
+ kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
+ objp = base + s->size * objnr;
+ kpp->kp_objp = objp;
+ if (WARN_ON_ONCE(objp < base || objp >= base + page->objects * s->size || (objp - base) % s->size) ||
+ !(s->flags & SLAB_STORE_USER))
+ return;
+#ifdef CONFIG_SLUB_DEBUG
+ trackp = get_track(s, objp, TRACK_ALLOC);
+ kpp->kp_ret = (void *)trackp->addr;
+#ifdef CONFIG_STACKTRACE
+ for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
+ kpp->kp_stack[i] = (void *)trackp->addrs[i];
+ if (!kpp->kp_stack[i])
+ break;
+ }
+#endif
+#endif
+}
+
/********************************************************************
* Kmalloc subsystem
*******************************************************************/
diff --git a/mm/util.c b/mm/util.c
index 8c9b7d1e7c499..54870226cea64 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -982,3 +982,34 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
kunmap_atomic(addr1);
return ret;
}
+
+/**
+ * mem_dump_obj - Print available provenance information
+ * @object: object for which to find provenance information.
+ *
+ * This function uses pr_cont(), so the caller is expected to have already
+ * printed out whatever preamble is appropriate. The provenance information
+ * depends on the type of object and on how much debugging is enabled.
+ * For example, for a slab-cache object, the slab name is printed, and,
+ * if available, the return address and stack trace from the allocation
+ * of that object.
+ */
+void mem_dump_obj(void *object)
+{
+ if (kmem_valid_obj(object)) {
+ kmem_dump_obj(object);
+ return;
+ }
+ if (vmalloc_dump_obj(object))
+ return;
+ if (!virt_addr_valid(object)) {
+ if (object == NULL)
+ pr_cont(" NULL pointer.\n");
+ else if (object == ZERO_SIZE_PTR)
+ pr_cont(" zero-size pointer.\n");
+ else
+ pr_cont(" non-paged memory.\n");
+ return;
+ }
+ pr_cont(" non-slab/vmalloc memory.\n");
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e6f352bf04982..4f5f8c907897a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3450,6 +3450,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
}
#endif /* CONFIG_SMP */
+bool vmalloc_dump_obj(void *object)
+{
+ struct vm_struct *vm;
+ void *objp = (void *)PAGE_ALIGN((unsigned long)object);
+
+ vm = find_vm_area(objp);
+ if (!vm)
+ return false;
+ pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
+ vm->nr_pages, (unsigned long)vm->addr, vm->caller);
+ return true;
+}
+
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
__acquires(&vmap_purge_lock)
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 5b47158f52750..ca4201753d5ef 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -7060,12 +7060,6 @@ sub process {
}
}
-# check for mutex_trylock_recursive usage
- if ($line =~ /mutex_trylock_recursive/) {
- ERROR("LOCKING",
- "recursive locking is bad, do not use this ever.\n" . $herecurr);
- }
-
# check for lockdep_set_novalidate_class
if ($line =~ /^.\s*lockdep_set_novalidate_class\s*\(/ ||
$line =~ /__lockdep_no_validate__\s*\)/ ) {
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index 7947cb1782daf..b7dd944dc8673 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -91,6 +91,7 @@
DISABLE_ENQCMD)
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define DISABLED_MASK19 0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h
index 52c6262e6bfd1..cc777c185212d 100644
--- a/tools/arch/x86/include/asm/insn.h
+++ b/tools/arch/x86/include/asm/insn.h
@@ -7,9 +7,12 @@
* Copyright (C) IBM Corporation, 2009
*/
+#include <asm/byteorder.h>
/* insn_attr_t is defined in inat.h */
#include "inat.h"
+#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
+
struct insn_field {
union {
insn_value_t value;
@@ -20,6 +23,48 @@ struct insn_field {
unsigned char nbytes;
};
+static inline void insn_field_set(struct insn_field *p, insn_value_t v,
+ unsigned char n)
+{
+ p->value = v;
+ p->nbytes = n;
+}
+
+static inline void insn_set_byte(struct insn_field *p, unsigned char n,
+ insn_byte_t v)
+{
+ p->bytes[n] = v;
+}
+
+#else
+
+struct insn_field {
+ insn_value_t value;
+ union {
+ insn_value_t little;
+ insn_byte_t bytes[4];
+ };
+ /* !0 if we've run insn_get_xxx() for this field */
+ unsigned char got;
+ unsigned char nbytes;
+};
+
+static inline void insn_field_set(struct insn_field *p, insn_value_t v,
+ unsigned char n)
+{
+ p->value = v;
+ p->little = __cpu_to_le32(v);
+ p->nbytes = n;
+}
+
+static inline void insn_set_byte(struct insn_field *p, unsigned char n,
+ insn_byte_t v)
+{
+ p->bytes[n] = v;
+ p->value = __le32_to_cpu(p->little);
+}
+#endif
+
struct insn {
struct insn_field prefixes; /*
* Prefixes
diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h
index fdbffec4cfdea..5a2baf28a1dcd 100644
--- a/tools/arch/x86/include/asm/orc_types.h
+++ b/tools/arch/x86/include/asm/orc_types.h
@@ -40,6 +40,8 @@
#define ORC_REG_MAX 15
#ifndef __ASSEMBLY__
+#include <asm/byteorder.h>
+
/*
* This struct is more or less a vastly simplified version of the DWARF Call
* Frame Information standard. It contains only the necessary parts of DWARF
@@ -51,10 +53,18 @@
struct orc_entry {
s16 sp_offset;
s16 bp_offset;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned sp_reg:4;
unsigned bp_reg:4;
unsigned type:2;
unsigned end:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ unsigned bp_reg:4;
+ unsigned sp_reg:4;
+ unsigned unused:5;
+ unsigned end:1;
+ unsigned type:2;
+#endif
} __packed;
#endif /* __ASSEMBLY__ */
diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h
index 3ff0d48469f28..b2d504f119370 100644
--- a/tools/arch/x86/include/asm/required-features.h
+++ b/tools/arch/x86/include/asm/required-features.h
@@ -101,6 +101,7 @@
#define REQUIRED_MASK16 0
#define REQUIRED_MASK17 0
#define REQUIRED_MASK18 0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define REQUIRED_MASK19 0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c
index 0151dfc6da616..3d9355ed12463 100644
--- a/tools/arch/x86/lib/insn.c
+++ b/tools/arch/x86/lib/insn.c
@@ -5,6 +5,7 @@
* Copyright (C) IBM Corporation, 2002, 2004, 2009
*/
+#include <linux/kernel.h>
#ifdef __KERNEL__
#include <linux/string.h>
#else
@@ -15,15 +16,28 @@
#include "../include/asm/emulate_prefix.h"
+#define leXX_to_cpu(t, r) \
+({ \
+ __typeof__(t) v; \
+ switch (sizeof(t)) { \
+ case 4: v = le32_to_cpu(r); break; \
+ case 2: v = le16_to_cpu(r); break; \
+ case 1: v = r; break; \
+ default: \
+ BUILD_BUG(); break; \
+ } \
+ v; \
+})
+
/* Verify next sizeof(t) bytes can be on the same instruction */
#define validate_next(t, insn, n) \
((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr)
#define __get_next(t, insn) \
- ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+ ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); })
#define __peek_nbyte_next(t, insn, n) \
- ({ t r = *(t*)((insn)->next_byte + n); r; })
+ ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); })
#define get_next(t, insn) \
({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
@@ -147,9 +161,9 @@ found:
b = insn->prefixes.bytes[3];
for (i = 0; i < nb; i++)
if (prefixes->bytes[i] == lb)
- prefixes->bytes[i] = b;
+ insn_set_byte(prefixes, i, b);
}
- insn->prefixes.bytes[3] = lb;
+ insn_set_byte(&insn->prefixes, 3, lb);
}
/* Decode REX prefix */
@@ -157,8 +171,7 @@ found:
b = peek_next(insn_byte_t, insn);
attr = inat_get_opcode_attribute(b);
if (inat_is_rex_prefix(attr)) {
- insn->rex_prefix.value = b;
- insn->rex_prefix.nbytes = 1;
+ insn_field_set(&insn->rex_prefix, b, 1);
insn->next_byte++;
if (X86_REX_W(b))
/* REX.W overrides opnd_size */
@@ -181,13 +194,13 @@ found:
if (X86_MODRM_MOD(b2) != 3)
goto vex_end;
}
- insn->vex_prefix.bytes[0] = b;
- insn->vex_prefix.bytes[1] = b2;
+ insn_set_byte(&insn->vex_prefix, 0, b);
+ insn_set_byte(&insn->vex_prefix, 1, b2);
if (inat_is_evex_prefix(attr)) {
b2 = peek_nbyte_next(insn_byte_t, insn, 2);
- insn->vex_prefix.bytes[2] = b2;
+ insn_set_byte(&insn->vex_prefix, 2, b2);
b2 = peek_nbyte_next(insn_byte_t, insn, 3);
- insn->vex_prefix.bytes[3] = b2;
+ insn_set_byte(&insn->vex_prefix, 3, b2);
insn->vex_prefix.nbytes = 4;
insn->next_byte += 4;
if (insn->x86_64 && X86_VEX_W(b2))
@@ -195,7 +208,7 @@ found:
insn->opnd_bytes = 8;
} else if (inat_is_vex3_prefix(attr)) {
b2 = peek_nbyte_next(insn_byte_t, insn, 2);
- insn->vex_prefix.bytes[2] = b2;
+ insn_set_byte(&insn->vex_prefix, 2, b2);
insn->vex_prefix.nbytes = 3;
insn->next_byte += 3;
if (insn->x86_64 && X86_VEX_W(b2))
@@ -207,7 +220,7 @@ found:
* Makes it easier to decode vex.W, vex.vvvv,
* vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
*/
- insn->vex_prefix.bytes[2] = b2 & 0x7f;
+ insn_set_byte(&insn->vex_prefix, 2, b2 & 0x7f);
insn->vex_prefix.nbytes = 2;
insn->next_byte += 2;
}
@@ -243,7 +256,7 @@ void insn_get_opcode(struct insn *insn)
/* Get first opcode */
op = get_next(insn_byte_t, insn);
- opcode->bytes[0] = op;
+ insn_set_byte(opcode, 0, op);
opcode->nbytes = 1;
/* Check if there is VEX prefix or not */
@@ -295,8 +308,7 @@ void insn_get_modrm(struct insn *insn)
if (inat_has_modrm(insn->attr)) {
mod = get_next(insn_byte_t, insn);
- modrm->value = mod;
- modrm->nbytes = 1;
+ insn_field_set(modrm, mod, 1);
if (inat_is_group(insn->attr)) {
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_group_attribute(mod, pfx_id,
@@ -334,7 +346,7 @@ int insn_rip_relative(struct insn *insn)
* For rip-relative instructions, the mod field (top 2 bits)
* is zero and the r/m field (bottom 3 bits) is 0x5.
*/
- return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+ return (modrm->nbytes && (modrm->bytes[0] & 0xc7) == 0x5);
}
/**
@@ -353,11 +365,11 @@ void insn_get_sib(struct insn *insn)
if (!insn->modrm.got)
insn_get_modrm(insn);
if (insn->modrm.nbytes) {
- modrm = (insn_byte_t)insn->modrm.value;
+ modrm = insn->modrm.bytes[0];
if (insn->addr_bytes != 2 &&
X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
- insn->sib.value = get_next(insn_byte_t, insn);
- insn->sib.nbytes = 1;
+ insn_field_set(&insn->sib,
+ get_next(insn_byte_t, insn), 1);
}
}
insn->sib.got = 1;
@@ -407,19 +419,18 @@ void insn_get_displacement(struct insn *insn)
if (mod == 3)
goto out;
if (mod == 1) {
- insn->displacement.value = get_next(signed char, insn);
- insn->displacement.nbytes = 1;
+ insn_field_set(&insn->displacement,
+ get_next(signed char, insn), 1);
} else if (insn->addr_bytes == 2) {
if ((mod == 0 && rm == 6) || mod == 2) {
- insn->displacement.value =
- get_next(short, insn);
- insn->displacement.nbytes = 2;
+ insn_field_set(&insn->displacement,
+ get_next(short, insn), 2);
}
} else {
if ((mod == 0 && rm == 5) || mod == 2 ||
(mod == 0 && base == 5)) {
- insn->displacement.value = get_next(int, insn);
- insn->displacement.nbytes = 4;
+ insn_field_set(&insn->displacement,
+ get_next(int, insn), 4);
}
}
}
@@ -435,18 +446,14 @@ static int __get_moffset(struct insn *insn)
{
switch (insn->addr_bytes) {
case 2:
- insn->moffset1.value = get_next(short, insn);
- insn->moffset1.nbytes = 2;
+ insn_field_set(&insn->moffset1, get_next(short, insn), 2);
break;
case 4:
- insn->moffset1.value = get_next(int, insn);
- insn->moffset1.nbytes = 4;
+ insn_field_set(&insn->moffset1, get_next(int, insn), 4);
break;
case 8:
- insn->moffset1.value = get_next(int, insn);
- insn->moffset1.nbytes = 4;
- insn->moffset2.value = get_next(int, insn);
- insn->moffset2.nbytes = 4;
+ insn_field_set(&insn->moffset1, get_next(int, insn), 4);
+ insn_field_set(&insn->moffset2, get_next(int, insn), 4);
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
@@ -464,13 +471,11 @@ static int __get_immv32(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
- insn->immediate.value = get_next(short, insn);
- insn->immediate.nbytes = 2;
+ insn_field_set(&insn->immediate, get_next(short, insn), 2);
break;
case 4:
case 8:
- insn->immediate.value = get_next(int, insn);
- insn->immediate.nbytes = 4;
+ insn_field_set(&insn->immediate, get_next(int, insn), 4);
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
@@ -487,18 +492,15 @@ static int __get_immv(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
- insn->immediate1.value = get_next(short, insn);
- insn->immediate1.nbytes = 2;
+ insn_field_set(&insn->immediate1, get_next(short, insn), 2);
break;
case 4:
- insn->immediate1.value = get_next(int, insn);
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
insn->immediate1.nbytes = 4;
break;
case 8:
- insn->immediate1.value = get_next(int, insn);
- insn->immediate1.nbytes = 4;
- insn->immediate2.value = get_next(int, insn);
- insn->immediate2.nbytes = 4;
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
+ insn_field_set(&insn->immediate2, get_next(int, insn), 4);
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
@@ -515,12 +517,10 @@ static int __get_immptr(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
- insn->immediate1.value = get_next(short, insn);
- insn->immediate1.nbytes = 2;
+ insn_field_set(&insn->immediate1, get_next(short, insn), 2);
break;
case 4:
- insn->immediate1.value = get_next(int, insn);
- insn->immediate1.nbytes = 4;
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
break;
case 8:
/* ptr16:64 is not exist (no segment) */
@@ -528,8 +528,7 @@ static int __get_immptr(struct insn *insn)
default: /* opnd_bytes must be modified manually */
goto err_out;
}
- insn->immediate2.value = get_next(unsigned short, insn);
- insn->immediate2.nbytes = 2;
+ insn_field_set(&insn->immediate2, get_next(unsigned short, insn), 2);
insn->immediate1.got = insn->immediate2.got = 1;
return 1;
@@ -565,22 +564,17 @@ void insn_get_immediate(struct insn *insn)
switch (inat_immediate_size(insn->attr)) {
case INAT_IMM_BYTE:
- insn->immediate.value = get_next(signed char, insn);
- insn->immediate.nbytes = 1;
+ insn_field_set(&insn->immediate, get_next(signed char, insn), 1);
break;
case INAT_IMM_WORD:
- insn->immediate.value = get_next(short, insn);
- insn->immediate.nbytes = 2;
+ insn_field_set(&insn->immediate, get_next(short, insn), 2);
break;
case INAT_IMM_DWORD:
- insn->immediate.value = get_next(int, insn);
- insn->immediate.nbytes = 4;
+ insn_field_set(&insn->immediate, get_next(int, insn), 4);
break;
case INAT_IMM_QWORD:
- insn->immediate1.value = get_next(int, insn);
- insn->immediate1.nbytes = 4;
- insn->immediate2.value = get_next(int, insn);
- insn->immediate2.nbytes = 4;
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
+ insn_field_set(&insn->immediate2, get_next(int, insn), 4);
break;
case INAT_IMM_PTR:
if (!__get_immptr(insn))
@@ -599,8 +593,7 @@ void insn_get_immediate(struct insn *insn)
goto err_out;
}
if (inat_has_second_immediate(insn->attr)) {
- insn->immediate2.value = get_next(signed char, insn);
- insn->immediate2.nbytes = 1;
+ insn_field_set(&insn->immediate2, get_next(signed char, insn), 1);
}
done:
insn->immediate.got = 1;
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index 577f51436cf92..7e72d975cb761 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -29,11 +29,14 @@ struct unwind_hint {
*
* UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
* sp_reg+sp_offset points to the iret return frame.
+ *
+ * UNWIND_HINT_TYPE_FUNC: Generate the unwind metadata of a callable function.
+ * Useful for code which doesn't have an ELF function annotation.
*/
#define UNWIND_HINT_TYPE_CALL 0
#define UNWIND_HINT_TYPE_REGS 1
#define UNWIND_HINT_TYPE_REGS_PARTIAL 2
-#define UNWIND_HINT_TYPE_RET_OFFSET 3
+#define UNWIND_HINT_TYPE_FUNC 3
#ifdef CONFIG_STACK_VALIDATION
@@ -109,6 +112,12 @@ struct unwind_hint {
.popsection
.endm
+.macro STACK_FRAME_NON_STANDARD func:req
+ .pushsection .discard.func_stack_frame_non_standard, "aw"
+ .long \func - .
+ .popsection
+.endm
+
#endif /* __ASSEMBLY__ */
#else /* !CONFIG_STACK_VALIDATION */
@@ -122,6 +131,8 @@ struct unwind_hint {
#define ANNOTATE_INTRA_FUNCTION_CALL
.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
.endm
+.macro STACK_FRAME_NON_STANDARD func:req
+.endm
#endif
#endif /* CONFIG_STACK_VALIDATION */
diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h
index 30dd21f976c30..2680f2edb837a 100644
--- a/tools/include/linux/rbtree.h
+++ b/tools/include/linux/rbtree.h
@@ -152,4 +152,194 @@ static inline void rb_replace_node_cached(struct rb_node *victim,
rb_replace_node(victim, new, &root->rb_root);
}
-#endif /* __TOOLS_LINUX_PERF_RBTREE_H */
+/*
+ * The below helper functions use 2 operators with 3 different
+ * calling conventions. The operators are related like:
+ *
+ * comp(a->key,b) < 0 := less(a,b)
+ * comp(a->key,b) > 0 := less(b,a)
+ * comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
+ *
+ * If these operators define a partial order on the elements we make no
+ * guarantee on which of the elements matching the key is found. See
+ * rb_find().
+ *
+ * The reason for this is to allow the find() interface without requiring an
+ * on-stack dummy object, which might not be feasible due to object size.
+ */
+
+/**
+ * rb_add_cached() - insert @node into the leftmost cached tree @tree
+ * @node: node to insert
+ * @tree: leftmost cached tree to insert @node into
+ * @less: operator defining the (partial) node order
+ */
+static __always_inline void
+rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
+ bool (*less)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node **link = &tree->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ while (*link) {
+ parent = *link;
+ if (less(node, parent)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(node, parent, link);
+ rb_insert_color_cached(node, tree, leftmost);
+}
+
+/**
+ * rb_add() - insert @node into @tree
+ * @node: node to insert
+ * @tree: tree to insert @node into
+ * @less: operator defining the (partial) node order
+ */
+static __always_inline void
+rb_add(struct rb_node *node, struct rb_root *tree,
+ bool (*less)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node **link = &tree->rb_node;
+ struct rb_node *parent = NULL;
+
+ while (*link) {
+ parent = *link;
+ if (less(node, parent))
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+
+ rb_link_node(node, parent, link);
+ rb_insert_color(node, tree);
+}
+
+/**
+ * rb_find_add() - find equivalent @node in @tree, or add @node
+ * @node: node to look-for / insert
+ * @tree: tree to search / modify
+ * @cmp: operator defining the node order
+ *
+ * Returns the rb_node matching @node, or NULL when no match is found and @node
+ * is inserted.
+ */
+static __always_inline struct rb_node *
+rb_find_add(struct rb_node *node, struct rb_root *tree,
+ int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node **link = &tree->rb_node;
+ struct rb_node *parent = NULL;
+ int c;
+
+ while (*link) {
+ parent = *link;
+ c = cmp(node, parent);
+
+ if (c < 0)
+ link = &parent->rb_left;
+ else if (c > 0)
+ link = &parent->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, link);
+ rb_insert_color(node, tree);
+ return NULL;
+}
+
+/**
+ * rb_find() - find @key in tree @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining the node order
+ *
+ * Returns the rb_node matching @key or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find(const void *key, const struct rb_root *tree,
+ int (*cmp)(const void *key, const struct rb_node *))
+{
+ struct rb_node *node = tree->rb_node;
+
+ while (node) {
+ int c = cmp(key, node);
+
+ if (c < 0)
+ node = node->rb_left;
+ else if (c > 0)
+ node = node->rb_right;
+ else
+ return node;
+ }
+
+ return NULL;
+}
+
+/**
+ * rb_find_first() - find the first @key in @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining node order
+ *
+ * Returns the leftmost node matching @key, or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find_first(const void *key, const struct rb_root *tree,
+ int (*cmp)(const void *key, const struct rb_node *))
+{
+ struct rb_node *node = tree->rb_node;
+ struct rb_node *match = NULL;
+
+ while (node) {
+ int c = cmp(key, node);
+
+ if (c <= 0) {
+ if (!c)
+ match = node;
+ node = node->rb_left;
+ } else if (c > 0) {
+ node = node->rb_right;
+ }
+ }
+
+ return match;
+}
+
+/**
+ * rb_next_match() - find the next node matching @key after @node
+ * @key: key to match
+ * @node: node from which to continue the search
+ * @cmp: operator defining node order
+ *
+ * Returns the next node matching @key, or NULL.
+ */
+static __always_inline struct rb_node *
+rb_next_match(const void *key, struct rb_node *node,
+ int (*cmp)(const void *key, const struct rb_node *))
+{
+ node = rb_next(node);
+ if (node && cmp(key, node))
+ node = NULL;
+ return node;
+}
+
+/**
+ * rb_for_each() - iterates a subtree matching @key
+ * @node: iterator
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining node order
+ */
+#define rb_for_each(node, key, tree, cmp) \
+ for ((node) = rb_find_first((key), (tree), (cmp)); \
+ (node); (node) = rb_next_match((key), (node), (cmp)))
+
+#endif /* __TOOLS_LINUX_PERF_RBTREE_H */
diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h
index 89135bb35bf76..ae5662d368b98 100644
--- a/tools/include/linux/static_call_types.h
+++ b/tools/include/linux/static_call_types.h
@@ -4,11 +4,13 @@
#include <linux/types.h>
#include <linux/stringify.h>
+#include <linux/compiler.h>
#define STATIC_CALL_KEY_PREFIX __SCK__
#define STATIC_CALL_KEY_PREFIX_STR __stringify(STATIC_CALL_KEY_PREFIX)
#define STATIC_CALL_KEY_PREFIX_LEN (sizeof(STATIC_CALL_KEY_PREFIX_STR) - 1)
#define STATIC_CALL_KEY(name) __PASTE(STATIC_CALL_KEY_PREFIX, name)
+#define STATIC_CALL_KEY_STR(name) __stringify(STATIC_CALL_KEY(name))
#define STATIC_CALL_TRAMP_PREFIX __SCT__
#define STATIC_CALL_TRAMP_PREFIX_STR __stringify(STATIC_CALL_TRAMP_PREFIX)
@@ -32,4 +34,52 @@ struct static_call_site {
s32 key;
};
+#define DECLARE_STATIC_CALL(name, func) \
+ extern struct static_call_key STATIC_CALL_KEY(name); \
+ extern typeof(func) STATIC_CALL_TRAMP(name);
+
+#ifdef CONFIG_HAVE_STATIC_CALL
+
+#define __raw_static_call(name) (&STATIC_CALL_TRAMP(name))
+
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+
+/*
+ * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from
+ * the symbol table so that objtool can reference it when it generates the
+ * .static_call_sites section.
+ */
+#define __STATIC_CALL_ADDRESSABLE(name) \
+ __ADDRESSABLE(STATIC_CALL_KEY(name))
+
+#define __static_call(name) \
+({ \
+ __STATIC_CALL_ADDRESSABLE(name); \
+ __raw_static_call(name); \
+})
+
+#else /* !CONFIG_HAVE_STATIC_CALL_INLINE */
+
+#define __STATIC_CALL_ADDRESSABLE(name)
+#define __static_call(name) __raw_static_call(name)
+
+#endif /* CONFIG_HAVE_STATIC_CALL_INLINE */
+
+#ifdef MODULE
+#define __STATIC_CALL_MOD_ADDRESSABLE(name)
+#define static_call_mod(name) __raw_static_call(name)
+#else
+#define __STATIC_CALL_MOD_ADDRESSABLE(name) __STATIC_CALL_ADDRESSABLE(name)
+#define static_call_mod(name) __static_call(name)
+#endif
+
+#define static_call(name) __static_call(name)
+
+#else
+
+#define static_call(name) \
+ ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func))
+
+#endif /* CONFIG_HAVE_STATIC_CALL */
+
#endif /* _STATIC_CALL_TYPES_H */
diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index e61d36cd4e501..8b7a9830dd221 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -71,7 +71,7 @@
*
* A simple static executable may be built this way :
* $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \
- * -static -include nolibc.h -lgcc -o hello hello.c
+ * -static -include nolibc.h -o hello hello.c -lgcc
*
* A very useful calling convention table may be found here :
* http://man7.org/linux/man-pages/man2/syscall.2.html
@@ -81,19 +81,12 @@
*
*/
-/* Some archs (at least aarch64) don't expose the regular syscalls anymore by
- * default, either because they have an "_at" replacement, or because there are
- * more modern alternatives. For now we'd rather still use them.
- */
-#define __ARCH_WANT_SYSCALL_NO_AT
-#define __ARCH_WANT_SYSCALL_NO_FLAGS
-#define __ARCH_WANT_SYSCALL_DEPRECATED
-
#include <asm/unistd.h>
#include <asm/ioctls.h>
#include <asm/errno.h>
#include <linux/fs.h>
#include <linux/loop.h>
+#include <linux/time.h>
#define NOLIBC
@@ -152,24 +145,6 @@ struct pollfd {
short int revents;
};
-/* for select() */
-struct timeval {
- long tv_sec;
- long tv_usec;
-};
-
-/* for pselect() */
-struct timespec {
- long tv_sec;
- long tv_nsec;
-};
-
-/* for gettimeofday() */
-struct timezone {
- int tz_minuteswest;
- int tz_dsttime;
-};
-
/* for getdents64() */
struct linux_dirent64 {
uint64_t d_ino;
@@ -271,6 +246,8 @@ struct stat {
#define WEXITSTATUS(status) (((status) & 0xff00) >> 8)
#define WIFEXITED(status) (((status) & 0x7f) == 0)
+/* for SIGCHLD */
+#include <asm/signal.h>
/* Below comes the architecture-specific code. For each architecture, we have
* the syscall declarations and the _start code definition. This is the only
@@ -1469,8 +1446,10 @@ int sys_chmod(const char *path, mode_t mode)
{
#ifdef __NR_fchmodat
return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0);
-#else
+#elif defined(__NR_chmod)
return my_syscall2(__NR_chmod, path, mode);
+#else
+#error Neither __NR_fchmodat nor __NR_chmod defined, cannot implement sys_chmod()
#endif
}
@@ -1479,8 +1458,10 @@ int sys_chown(const char *path, uid_t owner, gid_t group)
{
#ifdef __NR_fchownat
return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0);
-#else
+#elif defined(__NR_chown)
return my_syscall3(__NR_chown, path, owner, group);
+#else
+#error Neither __NR_fchownat nor __NR_chown defined, cannot implement sys_chown()
#endif
}
@@ -1502,10 +1483,24 @@ int sys_dup(int fd)
return my_syscall1(__NR_dup, fd);
}
+#ifdef __NR_dup3
+static __attribute__((unused))
+int sys_dup3(int old, int new, int flags)
+{
+ return my_syscall3(__NR_dup3, old, new, flags);
+}
+#endif
+
static __attribute__((unused))
int sys_dup2(int old, int new)
{
+#ifdef __NR_dup3
+ return my_syscall3(__NR_dup3, old, new, 0);
+#elif defined(__NR_dup2)
return my_syscall2(__NR_dup2, old, new);
+#else
+#error Neither __NR_dup3 nor __NR_dup2 defined, cannot implement sys_dup2()
+#endif
}
static __attribute__((unused))
@@ -1517,7 +1512,17 @@ int sys_execve(const char *filename, char *const argv[], char *const envp[])
static __attribute__((unused))
pid_t sys_fork(void)
{
+#ifdef __NR_clone
+ /* note: some archs only have clone() and not fork(). Different archs
+ * have a different API, but most archs have the flags on first arg and
+ * will not use the rest with no other flag.
+ */
+ return my_syscall5(__NR_clone, SIGCHLD, 0, 0, 0, 0);
+#elif defined(__NR_fork)
return my_syscall0(__NR_fork);
+#else
+#error Neither __NR_clone nor __NR_fork defined, cannot implement sys_fork()
+#endif
}
static __attribute__((unused))
@@ -1533,9 +1538,15 @@ int sys_getdents64(int fd, struct linux_dirent64 *dirp, int count)
}
static __attribute__((unused))
+pid_t sys_getpgid(pid_t pid)
+{
+ return my_syscall1(__NR_getpgid, pid);
+}
+
+static __attribute__((unused))
pid_t sys_getpgrp(void)
{
- return my_syscall0(__NR_getpgrp);
+ return sys_getpgid(0);
}
static __attribute__((unused))
@@ -1567,8 +1578,10 @@ int sys_link(const char *old, const char *new)
{
#ifdef __NR_linkat
return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0);
-#else
+#elif defined(__NR_link)
return my_syscall2(__NR_link, old, new);
+#else
+#error Neither __NR_linkat nor __NR_link defined, cannot implement sys_link()
#endif
}
@@ -1583,8 +1596,10 @@ int sys_mkdir(const char *path, mode_t mode)
{
#ifdef __NR_mkdirat
return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode);
-#else
+#elif defined(__NR_mkdir)
return my_syscall2(__NR_mkdir, path, mode);
+#else
+#error Neither __NR_mkdirat nor __NR_mkdir defined, cannot implement sys_mkdir()
#endif
}
@@ -1593,8 +1608,10 @@ long sys_mknod(const char *path, mode_t mode, dev_t dev)
{
#ifdef __NR_mknodat
return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev);
-#else
+#elif defined(__NR_mknod)
return my_syscall3(__NR_mknod, path, mode, dev);
+#else
+#error Neither __NR_mknodat nor __NR_mknod defined, cannot implement sys_mknod()
#endif
}
@@ -1610,8 +1627,10 @@ int sys_open(const char *path, int flags, mode_t mode)
{
#ifdef __NR_openat
return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode);
-#else
+#elif defined(__NR_open)
return my_syscall3(__NR_open, path, flags, mode);
+#else
+#error Neither __NR_openat nor __NR_open defined, cannot implement sys_open()
#endif
}
@@ -1624,7 +1643,19 @@ int sys_pivot_root(const char *new, const char *old)
static __attribute__((unused))
int sys_poll(struct pollfd *fds, int nfds, int timeout)
{
+#if defined(__NR_ppoll)
+ struct timespec t;
+
+ if (timeout >= 0) {
+ t.tv_sec = timeout / 1000;
+ t.tv_nsec = (timeout % 1000) * 1000000;
+ }
+ return my_syscall4(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL);
+#elif defined(__NR_poll)
return my_syscall3(__NR_poll, fds, nfds, timeout);
+#else
+#error Neither __NR_ppoll nor __NR_poll defined, cannot implement sys_poll()
+#endif
}
static __attribute__((unused))
@@ -1663,11 +1694,13 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva
t.tv_nsec = timeout->tv_usec * 1000;
}
return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
-#else
+#elif defined(__NR__newselect) || defined(__NR_select)
#ifndef __NR__newselect
#define __NR__newselect __NR_select
#endif
return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout);
+#else
+#error None of __NR_select, __NR_pselect6, nor __NR__newselect defined, cannot implement sys_select()
#endif
}
@@ -1692,8 +1725,10 @@ int sys_stat(const char *path, struct stat *buf)
#ifdef __NR_newfstatat
/* only solution for arm64 */
ret = my_syscall4(__NR_newfstatat, AT_FDCWD, path, &stat, 0);
-#else
+#elif defined(__NR_stat)
ret = my_syscall2(__NR_stat, path, &stat);
+#else
+#error Neither __NR_newfstatat nor __NR_stat defined, cannot implement sys_stat()
#endif
buf->st_dev = stat.st_dev;
buf->st_ino = stat.st_ino;
@@ -1717,8 +1752,10 @@ int sys_symlink(const char *old, const char *new)
{
#ifdef __NR_symlinkat
return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new);
-#else
+#elif defined(__NR_symlink)
return my_syscall2(__NR_symlink, old, new);
+#else
+#error Neither __NR_symlinkat nor __NR_symlink defined, cannot implement sys_symlink()
#endif
}
@@ -1739,8 +1776,10 @@ int sys_unlink(const char *path)
{
#ifdef __NR_unlinkat
return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0);
-#else
+#elif defined(__NR_unlink)
return my_syscall1(__NR_unlink, path);
+#else
+#error Neither __NR_unlinkat nor __NR_unlink defined, cannot implement sys_unlink()
#endif
}
@@ -1853,6 +1892,18 @@ int close(int fd)
}
static __attribute__((unused))
+int dup(int fd)
+{
+ int ret = sys_dup(fd);
+
+ if (ret < 0) {
+ SET_ERRNO(-ret);
+ ret = -1;
+ }
+ return ret;
+}
+
+static __attribute__((unused))
int dup2(int old, int new)
{
int ret = sys_dup2(old, new);
@@ -1864,6 +1915,20 @@ int dup2(int old, int new)
return ret;
}
+#ifdef __NR_dup3
+static __attribute__((unused))
+int dup3(int old, int new, int flags)
+{
+ int ret = sys_dup3(old, new, flags);
+
+ if (ret < 0) {
+ SET_ERRNO(-ret);
+ ret = -1;
+ }
+ return ret;
+}
+#endif
+
static __attribute__((unused))
int execve(const char *filename, char *const argv[], char *const envp[])
{
@@ -1913,6 +1978,18 @@ int getdents64(int fd, struct linux_dirent64 *dirp, int count)
}
static __attribute__((unused))
+pid_t getpgid(pid_t pid)
+{
+ pid_t ret = sys_getpgid(pid);
+
+ if (ret < 0) {
+ SET_ERRNO(-ret);
+ ret = -1;
+ }
+ return ret;
+}
+
+static __attribute__((unused))
pid_t getpgrp(void)
{
pid_t ret = sys_getpgrp();
diff --git a/tools/memory-model/Documentation/glossary.txt b/tools/memory-model/Documentation/glossary.txt
index 79acb75d56eaa..b2da6365be63c 100644
--- a/tools/memory-model/Documentation/glossary.txt
+++ b/tools/memory-model/Documentation/glossary.txt
@@ -33,10 +33,11 @@ Acquire: With respect to a lock, acquiring that lock, for example,
acquire loads.
When an acquire load returns the value stored by a release store
- to that same variable, then all operations preceding that store
- happen before any operations following that load acquire.
+ to that same variable (in other words, the acquire load "reads
+ from" the release store), then all operations preceding that
+ store "happen before" any operations following that load acquire.
- See also "Relaxed" and "Release".
+ See also "Happens-Before", "Reads-From", "Relaxed", and "Release".
Coherence (co): When one CPU's store to a given variable overwrites
either the value from another CPU's store or some later value,
@@ -119,6 +120,11 @@ Fully Ordered: An operation such as smp_mb() that orders all of
that orders all of its CPU's prior accesses, itself, and
all of its CPU's subsequent accesses.
+Happens-Before (hb): A relation between two accesses in which LKMM
+ guarantees the first access precedes the second. For more
+ detail, please see the "THE HAPPENS-BEFORE RELATION: hb"
+ section of explanation.txt.
+
Marked Access: An access to a variable that uses an special function or
macro such as "r1 = READ_ONCE(x)" or "smp_store_release(&a, 1)".
diff --git a/tools/memory-model/README b/tools/memory-model/README
index 39d08d1f0443f..9a84c45504ab6 100644
--- a/tools/memory-model/README
+++ b/tools/memory-model/README
@@ -51,7 +51,7 @@ klitmus7 Compatibility Table
============ ==========
target Linux herdtools7
------------ ----------
- -- 4.18 7.48 --
+ -- 4.14 7.48 --
4.15 -- 4.19 7.49 --
4.20 -- 5.5 7.54 --
5.6 -- 7.56 --
diff --git a/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus b/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus
index 772544f03fb5f..967f9f2a6226b 100644
--- a/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus
+++ b/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus
@@ -7,9 +7,7 @@ C CoRR+poonceonce+Once
* reads from the same variable are ordered.
*)
-{
- int x;
-}
+{}
P0(int *x)
{
diff --git a/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus b/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus
index 5faae98f7ffb3..4635739f3974d 100644
--- a/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus
+++ b/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus
@@ -7,9 +7,7 @@ C CoRW+poonceonce+Once
* a given variable and a later write to that same variable are ordered.
*)
-{
- int x;
-}
+{}
P0(int *x)
{
diff --git a/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus b/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus
index 77c9cc9f8dc66..bb068c92d8da2 100644
--- a/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus
+++ b/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus
@@ -7,9 +7,7 @@ C CoWR+poonceonce+Once
* given variable and a later read from that same variable are ordered.
*)
-{
- int x;
-}
+{}
P0(int *x)
{
diff --git a/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus b/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus
index 85ef746f511a7..0d9f0a9587996 100644
--- a/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus
+++ b/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus
@@ -7,9 +7,7 @@ C CoWW+poonceonce
* writes to the same variable are ordered.
*)
-{
- int x;
-}
+{}
P0(int *x)
{
diff --git a/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus b/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus
index 87aa900125ab2..e729d2776e89a 100644
--- a/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus
+++ b/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus
@@ -10,10 +10,7 @@ C IRIW+fencembonceonces+OnceOnce
* process? This litmus test exercises LKMM's "propagation" rule.
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x)
{
diff --git a/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus b/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus
index f84022dca5551..4b54dd6a6cd94 100644
--- a/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus
+++ b/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus
@@ -10,10 +10,7 @@ C IRIW+poonceonces+OnceOnce
* different process?
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x)
{
diff --git a/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus b/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus
index 398f624daa771..094d58df77896 100644
--- a/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus
+++ b/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus
@@ -7,12 +7,7 @@ C ISA2+pooncelock+pooncelock+pombonce
* (in P0() and P1()) is visible to external process P2().
*)
-{
- spinlock_t mylock;
- int x;
- int y;
- int z;
-}
+{}
P0(int *x, int *y, spinlock_t *mylock)
{
diff --git a/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus b/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus
index 212a432ba16ba..b321aa6f4ea52 100644
--- a/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus
+++ b/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus
@@ -9,11 +9,7 @@ C ISA2+poonceonces
* of the smp_load_acquire() invocations are replaced by READ_ONCE()?
*)
-{
- int x;
- int y;
- int z;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus b/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus
index 7afd85672ccde..025b0462ec9bc 100644
--- a/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus
+++ b/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus
@@ -11,11 +11,7 @@ C ISA2+pooncerelease+poacquirerelease+poacquireonce
* (AKA non-rf) link, so release-acquire is all that is needed.
*)
-{
- int x;
- int y;
- int z;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus b/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus
index c8a93c7ee556a..4727f5aaf03b0 100644
--- a/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus
+++ b/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus
@@ -11,10 +11,7 @@ C LB+fencembonceonce+ctrlonceonce
* another control dependency and order would still be maintained.)
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus b/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus
index 2fa029568fa1c..07b9904b0e49f 100644
--- a/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus
+++ b/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus
@@ -8,10 +8,7 @@ C LB+poacquireonce+pooncerelease
* to the other?
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/LB+poonceonces.litmus b/tools/memory-model/litmus-tests/LB+poonceonces.litmus
index 2107306e8625b..74c49cb3c37bf 100644
--- a/tools/memory-model/litmus-tests/LB+poonceonces.litmus
+++ b/tools/memory-model/litmus-tests/LB+poonceonces.litmus
@@ -7,10 +7,7 @@ C LB+poonceonces
* be prevented even with no explicit ordering?
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus b/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus
index c5c168d929737..f8ca1229857ad 100644
--- a/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus
+++ b/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus
@@ -8,10 +8,7 @@ C MP+fencewmbonceonce+fencermbonceonce
* is usually better to use smp_store_release() and smp_load_acquire().
*)
-{
- int buf;
- int flag;
-}
+{}
P0(int *buf, int *flag) // Producer
{
diff --git a/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus b/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus
index 20ff62649f1ee..d84160b9c1ae0 100644
--- a/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus
+++ b/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus
@@ -10,9 +10,7 @@ C MP+onceassign+derefonce
*)
{
- int *p=y;
- int x;
- int y=0;
+p=y;
}
P0(int *x, int **p) // Producer
diff --git a/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus b/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus
index 153917ad5dc95..ba91cc63e1487 100644
--- a/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus
+++ b/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus
@@ -10,10 +10,7 @@ C MP+polockmbonce+poacquiresilsil
* executed before the lock was acquired (loosely speaking).
*)
-{
- spinlock_t lo;
- int x;
-}
+{}
P0(spinlock_t *lo, int *x) // Producer
{
diff --git a/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus b/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus
index aad64397bb8cd..a5ea3ed8f52eb 100644
--- a/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus
+++ b/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus
@@ -10,10 +10,7 @@ C MP+polockonce+poacquiresilsil
* speaking).
*)
-{
- spinlock_t lo;
- int x;
-}
+{}
P0(spinlock_t *lo, int *x) // Producer
{
diff --git a/tools/memory-model/litmus-tests/MP+polocks.litmus b/tools/memory-model/litmus-tests/MP+polocks.litmus
index 21cbca6f3be4c..e6af05f70069f 100644
--- a/tools/memory-model/litmus-tests/MP+polocks.litmus
+++ b/tools/memory-model/litmus-tests/MP+polocks.litmus
@@ -11,11 +11,7 @@ C MP+polocks
* to see all prior accesses by those other CPUs.
*)
-{
- spinlock_t mylock;
- int buf;
- int flag;
-}
+{}
P0(int *buf, int *flag, spinlock_t *mylock) // Producer
{
diff --git a/tools/memory-model/litmus-tests/MP+poonceonces.litmus b/tools/memory-model/litmus-tests/MP+poonceonces.litmus
index 9f9769d647c7b..ba9c99c6cf65d 100644
--- a/tools/memory-model/litmus-tests/MP+poonceonces.litmus
+++ b/tools/memory-model/litmus-tests/MP+poonceonces.litmus
@@ -7,10 +7,7 @@ C MP+poonceonces
* no ordering at all?
*)
-{
- int buf;
- int flag;
-}
+{}
P0(int *buf, int *flag) // Producer
{
diff --git a/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus b/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus
index cbe28e7334437..f174bfe61702c 100644
--- a/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus
+++ b/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus
@@ -8,10 +8,7 @@ C MP+pooncerelease+poacquireonce
* pattern.
*)
-{
- int buf;
- int flag;
-}
+{}
P0(int *buf, int *flag) // Producer
{
diff --git a/tools/memory-model/litmus-tests/MP+porevlocks.litmus b/tools/memory-model/litmus-tests/MP+porevlocks.litmus
index 012041bd4feb3..b9599141160e6 100644
--- a/tools/memory-model/litmus-tests/MP+porevlocks.litmus
+++ b/tools/memory-model/litmus-tests/MP+porevlocks.litmus
@@ -11,11 +11,7 @@ C MP+porevlocks
* see all prior accesses by those other CPUs.
*)
-{
- spinlock_t mylock;
- int buf;
- int flag;
-}
+{}
P0(int *buf, int *flag, spinlock_t *mylock) // Consumer
{
diff --git a/tools/memory-model/litmus-tests/R+fencembonceonces.litmus b/tools/memory-model/litmus-tests/R+fencembonceonces.litmus
index af9463b39b4a5..222a0b850b4a5 100644
--- a/tools/memory-model/litmus-tests/R+fencembonceonces.litmus
+++ b/tools/memory-model/litmus-tests/R+fencembonceonces.litmus
@@ -9,10 +9,7 @@ C R+fencembonceonces
* cause the resulting test to be allowed.
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/R+poonceonces.litmus b/tools/memory-model/litmus-tests/R+poonceonces.litmus
index bcd5574e304ae..5386f128a131d 100644
--- a/tools/memory-model/litmus-tests/R+poonceonces.litmus
+++ b/tools/memory-model/litmus-tests/R+poonceonces.litmus
@@ -8,10 +8,7 @@ C R+poonceonces
* store propagation delays.
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus b/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus
index c36341d1aed66..18479823cd6cc 100644
--- a/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus
+++ b/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus
@@ -7,10 +7,7 @@ C S+fencewmbonceonce+poacquireonce
* store against a subsequent store?
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/S+poonceonces.litmus b/tools/memory-model/litmus-tests/S+poonceonces.litmus
index 7775c23143a0c..8c9c2f81a5805 100644
--- a/tools/memory-model/litmus-tests/S+poonceonces.litmus
+++ b/tools/memory-model/litmus-tests/S+poonceonces.litmus
@@ -9,10 +9,7 @@ C S+poonceonces
* READ_ONCE(), is ordering preserved?
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus b/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus
index 833cdfeb7c093..ed5fff18d2232 100644
--- a/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus
+++ b/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus
@@ -9,10 +9,7 @@ C SB+fencembonceonces
* suffice, but not much else.)
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/SB+poonceonces.litmus b/tools/memory-model/litmus-tests/SB+poonceonces.litmus
index c92211ecbfdf7..10d550730b25f 100644
--- a/tools/memory-model/litmus-tests/SB+poonceonces.litmus
+++ b/tools/memory-model/litmus-tests/SB+poonceonces.litmus
@@ -8,10 +8,7 @@ C SB+poonceonces
* variable that the preceding process reads.
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus b/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus
index 84344b455eb71..04a16603660bd 100644
--- a/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus
+++ b/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus
@@ -6,10 +6,7 @@ C SB+rfionceonce-poonceonces
* This litmus test demonstrates that LKMM is not fully multicopy atomic.
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus b/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus
index 431494708611b..6a2bc12a1af1a 100644
--- a/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus
+++ b/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus
@@ -8,10 +8,7 @@ C WRC+poonceonces+Once
* test has no ordering at all.
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x)
{
diff --git a/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus b/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus
index 554999c64db58..e9947250d7de6 100644
--- a/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus
+++ b/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus
@@ -10,10 +10,7 @@ C WRC+pooncerelease+fencermbonceonce+Once
* is A-cumulative in LKMM.
*)
-{
- int x;
- int y;
-}
+{}
P0(int *x)
{
diff --git a/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus b/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus
index 265a95ffef137..415248fb66990 100644
--- a/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus
+++ b/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus
@@ -9,12 +9,7 @@ C Z6.0+pooncelock+poonceLock+pombonce
* by CPUs not holding that lock.
*)
-{
- spinlock_t mylock;
- int x;
- int y;
- int z;
-}
+{}
P0(int *x, int *y, spinlock_t *mylock)
{
diff --git a/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus b/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus
index 0c9aea8e80df0..10a2aa04cd078 100644
--- a/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus
+++ b/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus
@@ -8,12 +8,7 @@ C Z6.0+pooncelock+pooncelock+pombonce
* seen as ordered by a third process not holding that lock.
*)
-{
- spinlock_t mylock;
- int x;
- int y;
- int z;
-}
+{}
P0(int *x, int *y, spinlock_t *mylock)
{
diff --git a/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus b/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus
index 661f9aaa57914..88e70b87a683e 100644
--- a/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus
+++ b/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus
@@ -14,11 +14,7 @@ C Z6.0+pooncerelease+poacquirerelease+fencembonceonce
* involving locking.)
*)
-{
- int x;
- int y;
- int z;
-}
+{}
P0(int *x, int *y)
{
diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore
index 45cefda24c7b1..14236db3677f6 100644
--- a/tools/objtool/.gitignore
+++ b/tools/objtool/.gitignore
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
arch/x86/lib/inat-tables.c
-objtool
+/objtool
fixdep
diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt
index 0542e46c75528..30f38fdc0d56c 100644
--- a/tools/objtool/Documentation/stack-validation.txt
+++ b/tools/objtool/Documentation/stack-validation.txt
@@ -315,13 +315,15 @@ they mean, and suggestions for how to fix them.
function tracing inserts additional calls, which is not obvious from the
sources).
-10. file.o: warning: func()+0x5c: alternative modifies stack
-
- This means that an alternative includes instructions that modify the
- stack. The problem is that there is only one ORC unwind table, this means
- that the ORC unwind entries must be valid for each of the alternatives.
- The easiest way to enforce this is to ensure alternatives do not contain
- any ORC entries, which in turn implies the above constraint.
+10. file.o: warning: func()+0x5c: stack layout conflict in alternatives
+
+ This means that in the use of the alternative() or ALTERNATIVE()
+ macro, the code paths have conflicting modifications to the stack.
+ The problem is that there is only one ORC unwind table, which means
+ that the ORC unwind entries must be consistent for all possible
+ instruction boundaries regardless of which code has been patched.
+ This limitation can be overcome by massaging the alternatives with
+ NOPs to shift the stack changes around so they no longer conflict.
11. file.o: warning: unannotated intra-function call
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 5cdb19036d7f7..92ce4fce7bc73 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -27,6 +27,7 @@ all: $(OBJTOOL)
INCLUDES := -I$(srctree)/tools/include \
-I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \
-I$(srctree)/tools/arch/$(SRCARCH)/include \
+ -I$(srctree)/tools/objtool/include \
-I$(srctree)/tools/objtool/arch/$(SRCARCH)/include
WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs
CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS)
@@ -46,10 +47,6 @@ ifeq ($(SRCARCH),x86)
SUBCMD_ORC := y
endif
-ifeq ($(SUBCMD_ORC),y)
- CFLAGS += -DINSN_USE_ORC
-endif
-
export SUBCMD_CHECK SUBCMD_ORC
export srctree OUTPUT CFLAGS SRCARCH AWK
include $(srctree)/tools/build/Makefile.include
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index cde9c36e40ae0..549813cff8abd 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -11,11 +11,11 @@
#include "../../../arch/x86/lib/inat.c"
#include "../../../arch/x86/lib/insn.c"
-#include "../../check.h"
-#include "../../elf.h"
-#include "../../arch.h"
-#include "../../warn.h"
#include <asm/orc_types.h>
+#include <objtool/check.h>
+#include <objtool/elf.h>
+#include <objtool/arch.h>
+#include <objtool/warn.h>
static unsigned char op_to_cfi_reg[][2] = {
{CFI_AX, CFI_R8},
@@ -222,15 +222,38 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec,
break;
case 0x89:
- if (rex_w && !rex_r && modrm_mod == 3 && modrm_reg == 4) {
+ if (rex_w && !rex_r && modrm_reg == 4) {
- /* mov %rsp, reg */
- ADD_OP(op) {
- op->src.type = OP_SRC_REG;
- op->src.reg = CFI_SP;
- op->dest.type = OP_DEST_REG;
- op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b];
+ if (modrm_mod == 3) {
+ /* mov %rsp, reg */
+ ADD_OP(op) {
+ op->src.type = OP_SRC_REG;
+ op->src.reg = CFI_SP;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b];
+ }
+ break;
+
+ } else {
+ /* skip nontrivial SIB */
+ if (modrm_rm == 4 && !(sib == 0x24 && rex_b == rex_x))
+ break;
+
+ /* skip RIP relative displacement */
+ if (modrm_rm == 5 && modrm_mod == 0)
+ break;
+
+ /* mov %rsp, disp(%reg) */
+ ADD_OP(op) {
+ op->src.type = OP_SRC_REG;
+ op->src.reg = CFI_SP;
+ op->dest.type = OP_DEST_REG_INDIRECT;
+ op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b];
+ op->dest.offset = insn.displacement.value;
+ }
+ break;
}
+
break;
}
@@ -259,8 +282,10 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec,
op->dest.reg = CFI_BP;
op->dest.offset = insn.displacement.value;
}
+ break;
+ }
- } else if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) {
+ if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) {
/* mov reg, disp(%rsp) */
ADD_OP(op) {
@@ -270,6 +295,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec,
op->dest.reg = CFI_SP;
op->dest.offset = insn.displacement.value;
}
+ break;
}
break;
@@ -563,8 +589,8 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state)
state->cfa.offset = 8;
/* initial RA (return address) */
- state->regs[16].base = CFI_CFA;
- state->regs[16].offset = -8;
+ state->regs[CFI_RA].base = CFI_CFA;
+ state->regs[CFI_RA].offset = -8;
}
const char *arch_nop_insn(int len)
diff --git a/tools/objtool/arch/x86/include/cfi_regs.h b/tools/objtool/arch/x86/include/arch/cfi_regs.h
index 79bc517efba85..79bc517efba85 100644
--- a/tools/objtool/arch/x86/include/cfi_regs.h
+++ b/tools/objtool/arch/x86/include/arch/cfi_regs.h
diff --git a/tools/objtool/arch/x86/include/arch_elf.h b/tools/objtool/arch/x86/include/arch/elf.h
index 69cc4264b28a8..69cc4264b28a8 100644
--- a/tools/objtool/arch/x86/include/arch_elf.h
+++ b/tools/objtool/arch/x86/include/arch/elf.h
diff --git a/tools/objtool/arch/x86/include/arch/endianness.h b/tools/objtool/arch/x86/include/arch/endianness.h
new file mode 100644
index 0000000000000..7c362527da205
--- /dev/null
+++ b/tools/objtool/arch/x86/include/arch/endianness.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ARCH_ENDIANNESS_H
+#define _ARCH_ENDIANNESS_H
+
+#include <endian.h>
+
+#define __TARGET_BYTE_ORDER __LITTLE_ENDIAN
+
+#endif /* _ARCH_ENDIANNESS_H */
diff --git a/tools/objtool/arch/x86/include/arch_special.h b/tools/objtool/arch/x86/include/arch/special.h
index d818b2bffa02c..d818b2bffa02c 100644
--- a/tools/objtool/arch/x86/include/arch_special.h
+++ b/tools/objtool/arch/x86/include/arch/special.h
diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c
index fd4af88c0ea52..e707d9bcd1616 100644
--- a/tools/objtool/arch/x86/special.c
+++ b/tools/objtool/arch/x86/special.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <string.h>
-#include "../../special.h"
-#include "../../builtin.h"
+#include <objtool/special.h>
+#include <objtool/builtin.h>
#define X86_FEATURE_POPCNT (4 * 32 + 23)
#define X86_FEATURE_SMAP (9 * 32 + 20)
@@ -48,7 +48,7 @@ bool arch_support_alt_relocation(struct special_alt *special_alt,
* replacement group.
*/
return insn->offset == special_alt->new_off &&
- (insn->type == INSN_CALL || is_static_jump(insn));
+ (insn->type == INSN_CALL || is_jump(insn));
}
/*
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index c6d199bfd0ae2..f47951e19c9d3 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -15,8 +15,8 @@
#include <subcmd/parse-options.h>
#include <string.h>
-#include "builtin.h"
-#include "objtool.h"
+#include <objtool/builtin.h>
+#include <objtool/objtool.h>
bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux;
diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c
index 7b31121fa60b2..8273bbf7cebb1 100644
--- a/tools/objtool/builtin-orc.c
+++ b/tools/objtool/builtin-orc.c
@@ -13,8 +13,8 @@
*/
#include <string.h>
-#include "builtin.h"
-#include "objtool.h"
+#include <objtool/builtin.h>
+#include <objtool/objtool.h>
static const char *orc_usage[] = {
"objtool orc generate [<options>] file.o",
@@ -51,11 +51,7 @@ int cmd_orc(int argc, const char **argv)
if (list_empty(&file->insn_list))
return 0;
- ret = create_orc(file);
- if (ret)
- return ret;
-
- ret = create_orc_sections(file);
+ ret = orc_create(file);
if (ret)
return ret;
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 4bd30315eb62b..331a763d87756 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -6,21 +6,20 @@
#include <string.h>
#include <stdlib.h>
-#include "builtin.h"
-#include "cfi.h"
-#include "arch.h"
-#include "check.h"
-#include "special.h"
-#include "warn.h"
-#include "arch_elf.h"
+#include <arch/elf.h>
+#include <objtool/builtin.h>
+#include <objtool/cfi.h>
+#include <objtool/arch.h>
+#include <objtool/check.h>
+#include <objtool/special.h>
+#include <objtool/warn.h>
+#include <objtool/endianness.h>
#include <linux/objtool.h>
#include <linux/hashtable.h>
#include <linux/kernel.h>
#include <linux/static_call_types.h>
-#define FAKE_JUMP_OFFSET -1
-
struct alternative {
struct list_head list;
struct instruction *insn;
@@ -111,15 +110,20 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file,
static bool is_sibling_call(struct instruction *insn)
{
+ /*
+ * Assume only ELF functions can make sibling calls. This ensures
+ * sibling call detection consistency between vmlinux.o and individual
+ * objects.
+ */
+ if (!insn->func)
+ return false;
+
/* An indirect jump is either a sibling call or a jump to a table. */
if (insn->type == INSN_JUMP_DYNAMIC)
return list_empty(&insn->alts);
- if (!is_static_jump(insn))
- return false;
-
/* add_jump_destinations() sets insn->call_dest for sibling calls. */
- return !!insn->call_dest;
+ return (is_static_jump(insn) && insn->call_dest);
}
/*
@@ -156,6 +160,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
"machine_real_restart",
"rewind_stack_do_exit",
"kunit_try_catch_throw",
+ "xen_start_kernel",
};
if (!func)
@@ -502,8 +507,21 @@ static int create_static_call_sections(struct objtool_file *file)
key_sym = find_symbol_by_name(file->elf, tmp);
if (!key_sym) {
- WARN("static_call: can't find static_call_key symbol: %s", tmp);
- return -1;
+ if (!module) {
+ WARN("static_call: can't find static_call_key symbol: %s", tmp);
+ return -1;
+ }
+
+ /*
+ * For modules(), the key might not be exported, which
+ * means the module can make static calls but isn't
+ * allowed to change them.
+ *
+ * In that case we temporarily set the key to be the
+ * trampoline address. This is fixed up in
+ * static_call_add_module().
+ */
+ key_sym = insn->call_dest;
}
free(key_name);
@@ -774,22 +792,16 @@ static int add_jump_destinations(struct objtool_file *file)
if (!is_static_jump(insn))
continue;
- if (insn->offset == FAKE_JUMP_OFFSET)
- continue;
-
reloc = find_reloc_by_dest_range(file->elf, insn->sec,
- insn->offset, insn->len);
+ insn->offset, insn->len);
if (!reloc) {
dest_sec = insn->sec;
dest_off = arch_jump_destination(insn);
} else if (reloc->sym->type == STT_SECTION) {
dest_sec = reloc->sym->sec;
dest_off = arch_dest_reloc_offset(reloc->addend);
- } else if (reloc->sym->sec->idx) {
- dest_sec = reloc->sym->sec;
- dest_off = reloc->sym->sym.st_value +
- arch_dest_reloc_offset(reloc->addend);
- } else if (strstr(reloc->sym->name, "_indirect_thunk_")) {
+ } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21) ||
+ !strncmp(reloc->sym->name, "__x86_retpoline_", 16)) {
/*
* Retpoline jumps are really dynamic jumps in
* disguise, so convert them accordingly.
@@ -801,14 +813,21 @@ static int add_jump_destinations(struct objtool_file *file)
insn->retpoline_safe = true;
continue;
- } else {
- /* external sibling call */
+ } else if (insn->func) {
+ /* internal or external sibling call (with reloc) */
insn->call_dest = reloc->sym;
if (insn->call_dest->static_call_tramp) {
list_add_tail(&insn->static_call_node,
&file->static_call_list);
}
continue;
+ } else if (reloc->sym->sec->idx) {
+ dest_sec = reloc->sym->sec;
+ dest_off = reloc->sym->sym.st_value +
+ arch_dest_reloc_offset(reloc->addend);
+ } else {
+ /* non-func asm code jumping to another file */
+ continue;
}
insn->jump_dest = find_insn(file, dest_sec, dest_off);
@@ -849,15 +868,15 @@ static int add_jump_destinations(struct objtool_file *file)
* case where the parent function's only reference to a
* subfunction is through a jump table.
*/
- if (!strstr(insn->func->name, ".cold.") &&
- strstr(insn->jump_dest->func->name, ".cold.")) {
+ if (!strstr(insn->func->name, ".cold") &&
+ strstr(insn->jump_dest->func->name, ".cold")) {
insn->func->cfunc = insn->jump_dest->func;
insn->jump_dest->func->pfunc = insn->func;
} else if (insn->jump_dest->func->pfunc != insn->func->pfunc &&
insn->jump_dest->offset == insn->jump_dest->func->offset) {
- /* internal sibling call */
+ /* internal sibling call (without reloc) */
insn->call_dest = insn->jump_dest->func;
if (insn->call_dest->static_call_tramp) {
list_add_tail(&insn->static_call_node,
@@ -970,73 +989,83 @@ static int add_call_destinations(struct objtool_file *file)
}
/*
- * The .alternatives section requires some extra special care, over and above
- * what other special sections require:
- *
- * 1. Because alternatives are patched in-place, we need to insert a fake jump
- * instruction at the end so that validate_branch() skips all the original
- * replaced instructions when validating the new instruction path.
- *
- * 2. An added wrinkle is that the new instruction length might be zero. In
- * that case the old instructions are replaced with noops. We simulate that
- * by creating a fake jump as the only new instruction.
- *
- * 3. In some cases, the alternative section includes an instruction which
- * conditionally jumps to the _end_ of the entry. We have to modify these
- * jumps' destinations to point back to .text rather than the end of the
- * entry in .altinstr_replacement.
+ * The .alternatives section requires some extra special care over and above
+ * other special sections because alternatives are patched in place.
*/
static int handle_group_alt(struct objtool_file *file,
struct special_alt *special_alt,
struct instruction *orig_insn,
struct instruction **new_insn)
{
- static unsigned int alt_group_next_index = 1;
- struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump = NULL;
- unsigned int alt_group = alt_group_next_index++;
+ struct instruction *last_orig_insn, *last_new_insn = NULL, *insn, *nop = NULL;
+ struct alt_group *orig_alt_group, *new_alt_group;
unsigned long dest_off;
+
+ orig_alt_group = malloc(sizeof(*orig_alt_group));
+ if (!orig_alt_group) {
+ WARN("malloc failed");
+ return -1;
+ }
+ orig_alt_group->cfi = calloc(special_alt->orig_len,
+ sizeof(struct cfi_state *));
+ if (!orig_alt_group->cfi) {
+ WARN("calloc failed");
+ return -1;
+ }
+
last_orig_insn = NULL;
insn = orig_insn;
sec_for_each_insn_from(file, insn) {
if (insn->offset >= special_alt->orig_off + special_alt->orig_len)
break;
- insn->alt_group = alt_group;
+ insn->alt_group = orig_alt_group;
last_orig_insn = insn;
}
+ orig_alt_group->orig_group = NULL;
+ orig_alt_group->first_insn = orig_insn;
+ orig_alt_group->last_insn = last_orig_insn;
- if (next_insn_same_sec(file, last_orig_insn)) {
- fake_jump = malloc(sizeof(*fake_jump));
- if (!fake_jump) {
+
+ new_alt_group = malloc(sizeof(*new_alt_group));
+ if (!new_alt_group) {
+ WARN("malloc failed");
+ return -1;
+ }
+
+ if (special_alt->new_len < special_alt->orig_len) {
+ /*
+ * Insert a fake nop at the end to make the replacement
+ * alt_group the same size as the original. This is needed to
+ * allow propagate_alt_cfi() to do its magic. When the last
+ * instruction affects the stack, the instruction after it (the
+ * nop) will propagate the new state to the shared CFI array.
+ */
+ nop = malloc(sizeof(*nop));
+ if (!nop) {
WARN("malloc failed");
return -1;
}
- memset(fake_jump, 0, sizeof(*fake_jump));
- INIT_LIST_HEAD(&fake_jump->alts);
- INIT_LIST_HEAD(&fake_jump->stack_ops);
- init_cfi_state(&fake_jump->cfi);
+ memset(nop, 0, sizeof(*nop));
+ INIT_LIST_HEAD(&nop->alts);
+ INIT_LIST_HEAD(&nop->stack_ops);
+ init_cfi_state(&nop->cfi);
- fake_jump->sec = special_alt->new_sec;
- fake_jump->offset = FAKE_JUMP_OFFSET;
- fake_jump->type = INSN_JUMP_UNCONDITIONAL;
- fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
- fake_jump->func = orig_insn->func;
+ nop->sec = special_alt->new_sec;
+ nop->offset = special_alt->new_off + special_alt->new_len;
+ nop->len = special_alt->orig_len - special_alt->new_len;
+ nop->type = INSN_NOP;
+ nop->func = orig_insn->func;
+ nop->alt_group = new_alt_group;
+ nop->ignore = orig_insn->ignore_alts;
}
if (!special_alt->new_len) {
- if (!fake_jump) {
- WARN("%s: empty alternative at end of section",
- special_alt->orig_sec->name);
- return -1;
- }
-
- *new_insn = fake_jump;
- return 0;
+ *new_insn = nop;
+ goto end;
}
- last_new_insn = NULL;
- alt_group = alt_group_next_index++;
insn = *new_insn;
sec_for_each_insn_from(file, insn) {
struct reloc *alt_reloc;
@@ -1048,7 +1077,7 @@ static int handle_group_alt(struct objtool_file *file,
insn->ignore = orig_insn->ignore_alts;
insn->func = orig_insn->func;
- insn->alt_group = alt_group;
+ insn->alt_group = new_alt_group;
/*
* Since alternative replacement code is copy/pasted by the
@@ -1075,14 +1104,8 @@ static int handle_group_alt(struct objtool_file *file,
continue;
dest_off = arch_jump_destination(insn);
- if (dest_off == special_alt->new_off + special_alt->new_len) {
- if (!fake_jump) {
- WARN("%s: alternative jump to end of section",
- special_alt->orig_sec->name);
- return -1;
- }
- insn->jump_dest = fake_jump;
- }
+ if (dest_off == special_alt->new_off + special_alt->new_len)
+ insn->jump_dest = next_insn_same_sec(file, last_orig_insn);
if (!insn->jump_dest) {
WARN_FUNC("can't find alternative jump destination",
@@ -1097,9 +1120,13 @@ static int handle_group_alt(struct objtool_file *file,
return -1;
}
- if (fake_jump)
- list_add(&fake_jump->list, &last_new_insn->list);
-
+ if (nop)
+ list_add(&nop->list, &last_new_insn->list);
+end:
+ new_alt_group->orig_group = orig_alt_group;
+ new_alt_group->first_insn = *new_insn;
+ new_alt_group->last_insn = nop ? : last_new_insn;
+ new_alt_group->cfi = orig_alt_group->cfi;
return 0;
}
@@ -1391,13 +1418,20 @@ static int add_jump_table_alts(struct objtool_file *file)
return 0;
}
+static void set_func_state(struct cfi_state *state)
+{
+ state->cfa = initial_func_cfi.cfa;
+ memcpy(&state->regs, &initial_func_cfi.regs,
+ CFI_NUM_REGS * sizeof(struct cfi_reg));
+ state->stack_size = initial_func_cfi.cfa.offset;
+}
+
static int read_unwind_hints(struct objtool_file *file)
{
struct section *sec, *relocsec;
struct reloc *reloc;
struct unwind_hint *hint;
struct instruction *insn;
- struct cfi_reg *cfa;
int i;
sec = find_section_by_name(file->elf, ".discard.unwind_hints");
@@ -1432,22 +1466,20 @@ static int read_unwind_hints(struct objtool_file *file)
return -1;
}
- cfa = &insn->cfi.cfa;
+ insn->hint = true;
- if (hint->type == UNWIND_HINT_TYPE_RET_OFFSET) {
- insn->ret_offset = hint->sp_offset;
+ if (hint->type == UNWIND_HINT_TYPE_FUNC) {
+ set_func_state(&insn->cfi);
continue;
}
- insn->hint = true;
-
if (arch_decode_hint_reg(insn, hint->sp_reg)) {
WARN_FUNC("unsupported unwind_hint sp base reg %d",
insn->sec, insn->offset, hint->sp_reg);
return -1;
}
- cfa->offset = hint->sp_offset;
+ insn->cfi.cfa.offset = bswap_if_needed(hint->sp_offset);
insn->cfi.type = hint->type;
insn->cfi.end = hint->end;
}
@@ -1703,27 +1735,18 @@ static bool is_fentry_call(struct instruction *insn)
static bool has_modified_stack_frame(struct instruction *insn, struct insn_state *state)
{
- u8 ret_offset = insn->ret_offset;
struct cfi_state *cfi = &state->cfi;
int i;
if (cfi->cfa.base != initial_func_cfi.cfa.base || cfi->drap)
return true;
- if (cfi->cfa.offset != initial_func_cfi.cfa.offset + ret_offset)
+ if (cfi->cfa.offset != initial_func_cfi.cfa.offset)
return true;
- if (cfi->stack_size != initial_func_cfi.cfa.offset + ret_offset)
+ if (cfi->stack_size != initial_func_cfi.cfa.offset)
return true;
- /*
- * If there is a ret offset hint then don't check registers
- * because a callee-saved register might have been pushed on
- * the stack.
- */
- if (ret_offset)
- return false;
-
for (i = 0; i < CFI_NUM_REGS; i++) {
if (cfi->regs[i].base != initial_func_cfi.regs[i].base ||
cfi->regs[i].offset != initial_func_cfi.regs[i].offset)
@@ -1733,12 +1756,20 @@ static bool has_modified_stack_frame(struct instruction *insn, struct insn_state
return false;
}
+static bool check_reg_frame_pos(const struct cfi_reg *reg,
+ int expected_offset)
+{
+ return reg->base == CFI_CFA &&
+ reg->offset == expected_offset;
+}
+
static bool has_valid_stack_frame(struct insn_state *state)
{
struct cfi_state *cfi = &state->cfi;
- if (cfi->cfa.base == CFI_BP && cfi->regs[CFI_BP].base == CFI_CFA &&
- cfi->regs[CFI_BP].offset == -16)
+ if (cfi->cfa.base == CFI_BP &&
+ check_reg_frame_pos(&cfi->regs[CFI_BP], -cfi->cfa.offset) &&
+ check_reg_frame_pos(&cfi->regs[CFI_RA], -cfi->cfa.offset + 8))
return true;
if (cfi->drap && cfi->regs[CFI_BP].base == CFI_BP)
@@ -1867,8 +1898,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi,
case OP_SRC_REG:
if (op->src.reg == CFI_SP && op->dest.reg == CFI_BP &&
cfa->base == CFI_SP &&
- regs[CFI_BP].base == CFI_CFA &&
- regs[CFI_BP].offset == -cfa->offset) {
+ check_reg_frame_pos(&regs[CFI_BP], -cfa->offset)) {
/* mov %rsp, %rbp */
cfa->base = op->dest.reg;
@@ -1928,6 +1958,38 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi,
cfa->offset = -cfi->vals[op->src.reg].offset;
cfi->stack_size = cfa->offset;
+ } else if (cfa->base == CFI_SP &&
+ cfi->vals[op->src.reg].base == CFI_SP_INDIRECT &&
+ cfi->vals[op->src.reg].offset == cfa->offset) {
+
+ /*
+ * Stack swizzle:
+ *
+ * 1: mov %rsp, (%[tos])
+ * 2: mov %[tos], %rsp
+ * ...
+ * 3: pop %rsp
+ *
+ * Where:
+ *
+ * 1 - places a pointer to the previous
+ * stack at the Top-of-Stack of the
+ * new stack.
+ *
+ * 2 - switches to the new stack.
+ *
+ * 3 - pops the Top-of-Stack to restore
+ * the original stack.
+ *
+ * Note: we set base to SP_INDIRECT
+ * here and preserve offset. Therefore
+ * when the unwinder reaches ToS it
+ * will dereference SP and then add the
+ * offset to find the next frame, IOW:
+ * (%rsp) + offset.
+ */
+ cfa->base = CFI_SP_INDIRECT;
+
} else {
cfa->base = CFI_UNDEFINED;
cfa->offset = 0;
@@ -1953,6 +2015,17 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi,
break;
}
+ if (!cfi->drap && op->src.reg == CFI_SP &&
+ op->dest.reg == CFI_BP && cfa->base == CFI_SP &&
+ check_reg_frame_pos(&regs[CFI_BP], -cfa->offset + op->src.offset)) {
+
+ /* lea disp(%rsp), %rbp */
+ cfa->base = CFI_BP;
+ cfa->offset -= op->src.offset;
+ cfi->bp_scratch = false;
+ break;
+ }
+
if (op->src.reg == CFI_SP && cfa->base == CFI_SP) {
/* drap: lea disp(%rsp), %drap */
@@ -2019,6 +2092,13 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi,
case OP_SRC_POP:
case OP_SRC_POPF:
+ if (op->dest.reg == CFI_SP && cfa->base == CFI_SP_INDIRECT) {
+
+ /* pop %rsp; # restore from a stack swizzle */
+ cfa->base = CFI_SP;
+ break;
+ }
+
if (!cfi->drap && op->dest.reg == cfa->base) {
/* pop %rbp */
@@ -2047,6 +2127,14 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi,
break;
case OP_SRC_REG_INDIRECT:
+ if (!cfi->drap && op->dest.reg == cfa->base &&
+ op->dest.reg == CFI_BP) {
+
+ /* mov disp(%rsp), %rbp */
+ cfa->base = CFI_SP;
+ cfa->offset = cfi->stack_size;
+ }
+
if (cfi->drap && op->src.reg == CFI_BP &&
op->src.offset == cfi->drap_offset) {
@@ -2068,6 +2156,12 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi,
/* mov disp(%rbp), %reg */
/* mov disp(%rsp), %reg */
restore_reg(cfi, op->dest.reg);
+
+ } else if (op->src.reg == CFI_SP &&
+ op->src.offset == regs[op->dest.reg].offset + cfi->stack_size) {
+
+ /* mov disp(%rsp), %reg */
+ restore_reg(cfi, op->dest.reg);
}
break;
@@ -2145,6 +2239,18 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi,
/* mov reg, disp(%rsp) */
save_reg(cfi, op->src.reg, CFI_CFA,
op->dest.offset - cfi->cfa.offset);
+
+ } else if (op->dest.reg == CFI_SP) {
+
+ /* mov reg, disp(%rsp) */
+ save_reg(cfi, op->src.reg, CFI_CFA,
+ op->dest.offset - cfi->stack_size);
+
+ } else if (op->src.reg == CFI_SP && op->dest.offset == 0) {
+
+ /* mov %rsp, (%reg); # setup a stack swizzle. */
+ cfi->vals[op->dest.reg].base = CFI_SP_INDIRECT;
+ cfi->vals[op->dest.reg].offset = cfa->offset;
}
break;
@@ -2192,22 +2298,47 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi,
return 0;
}
-static int handle_insn_ops(struct instruction *insn, struct insn_state *state)
+/*
+ * The stack layouts of alternatives instructions can sometimes diverge when
+ * they have stack modifications. That's fine as long as the potential stack
+ * layouts don't conflict at any given potential instruction boundary.
+ *
+ * Flatten the CFIs of the different alternative code streams (both original
+ * and replacement) into a single shared CFI array which can be used to detect
+ * conflicts and nicely feed a linear array of ORC entries to the unwinder.
+ */
+static int propagate_alt_cfi(struct objtool_file *file, struct instruction *insn)
{
- struct stack_op *op;
+ struct cfi_state **alt_cfi;
+ int group_off;
- list_for_each_entry(op, &insn->stack_ops, list) {
- struct cfi_state old_cfi = state->cfi;
- int res;
+ if (!insn->alt_group)
+ return 0;
- res = update_cfi_state(insn, &state->cfi, op);
- if (res)
- return res;
+ alt_cfi = insn->alt_group->cfi;
+ group_off = insn->offset - insn->alt_group->first_insn->offset;
- if (insn->alt_group && memcmp(&state->cfi, &old_cfi, sizeof(struct cfi_state))) {
- WARN_FUNC("alternative modifies stack", insn->sec, insn->offset);
+ if (!alt_cfi[group_off]) {
+ alt_cfi[group_off] = &insn->cfi;
+ } else {
+ if (memcmp(alt_cfi[group_off], &insn->cfi, sizeof(struct cfi_state))) {
+ WARN_FUNC("stack layout conflict in alternatives",
+ insn->sec, insn->offset);
return -1;
}
+ }
+
+ return 0;
+}
+
+static int handle_insn_ops(struct instruction *insn, struct insn_state *state)
+{
+ struct stack_op *op;
+
+ list_for_each_entry(op, &insn->stack_ops, list) {
+
+ if (update_cfi_state(insn, &state->cfi, op))
+ return 1;
if (op->dest.type == OP_DEST_PUSHF) {
if (!state->uaccess_stack) {
@@ -2397,28 +2528,20 @@ static int validate_return(struct symbol *func, struct instruction *insn, struct
return 0;
}
-/*
- * Alternatives should not contain any ORC entries, this in turn means they
- * should not contain any CFI ops, which implies all instructions should have
- * the same same CFI state.
- *
- * It is possible to constuct alternatives that have unreachable holes that go
- * unreported (because they're NOPs), such holes would result in CFI_UNDEFINED
- * states which then results in ORC entries, which we just said we didn't want.
- *
- * Avoid them by copying the CFI entry of the first instruction into the whole
- * alternative.
- */
-static void fill_alternative_cfi(struct objtool_file *file, struct instruction *insn)
+static struct instruction *next_insn_to_validate(struct objtool_file *file,
+ struct instruction *insn)
{
- struct instruction *first_insn = insn;
- int alt_group = insn->alt_group;
+ struct alt_group *alt_group = insn->alt_group;
- sec_for_each_insn_continue(file, insn) {
- if (insn->alt_group != alt_group)
- break;
- insn->cfi = first_insn->cfi;
- }
+ /*
+ * Simulate the fact that alternatives are patched in-place. When the
+ * end of a replacement alt_group is reached, redirect objtool flow to
+ * the end of the original alt_group.
+ */
+ if (alt_group && insn == alt_group->last_insn && alt_group->orig_group)
+ return next_insn_same_sec(file, alt_group->orig_group->last_insn);
+
+ return next_insn_same_sec(file, insn);
}
/*
@@ -2439,7 +2562,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
sec = insn->sec;
while (1) {
- next_insn = next_insn_same_sec(file, insn);
+ next_insn = next_insn_to_validate(file, insn);
if (file->c_file && func && insn->func && func != insn->func->pfunc) {
WARN("%s() falls through to next function %s()",
@@ -2472,6 +2595,9 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
insn->visited |= visited;
+ if (propagate_alt_cfi(file, insn))
+ return 1;
+
if (!insn->ignore_alts && !list_empty(&insn->alts)) {
bool skip_orig = false;
@@ -2487,9 +2613,6 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
}
}
- if (insn->alt_group)
- fill_alternative_cfi(file, insn);
-
if (skip_orig)
return 0;
}
@@ -2527,7 +2650,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
case INSN_JUMP_CONDITIONAL:
case INSN_JUMP_UNCONDITIONAL:
- if (func && is_sibling_call(insn)) {
+ if (is_sibling_call(insn)) {
ret = validate_sibling_call(insn, &state);
if (ret)
return ret;
@@ -2549,7 +2672,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
case INSN_JUMP_DYNAMIC:
case INSN_JUMP_DYNAMIC_CONDITIONAL:
- if (func && is_sibling_call(insn)) {
+ if (is_sibling_call(insn)) {
ret = validate_sibling_call(insn, &state);
if (ret)
return ret;
@@ -2592,15 +2715,19 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
break;
case INSN_STD:
- if (state.df)
+ if (state.df) {
WARN_FUNC("recursive STD", sec, insn->offset);
+ return 1;
+ }
state.df = true;
break;
case INSN_CLD:
- if (!state.df && func)
+ if (!state.df && func) {
WARN_FUNC("redundant CLD", sec, insn->offset);
+ return 1;
+ }
state.df = false;
break;
@@ -2723,9 +2850,6 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio
!strcmp(insn->sec->name, ".altinstr_aux"))
return true;
- if (insn->type == INSN_JUMP_UNCONDITIONAL && insn->offset == FAKE_JUMP_OFFSET)
- return true;
-
if (!insn->func)
return false;
@@ -2811,10 +2935,7 @@ static int validate_section(struct objtool_file *file, struct section *sec)
continue;
init_insn_state(&state, sec);
- state.cfi.cfa = initial_func_cfi.cfa;
- memcpy(&state.cfi.regs, &initial_func_cfi.regs,
- CFI_NUM_REGS * sizeof(struct cfi_reg));
- state.cfi.stack_size = initial_func_cfi.cfa.offset;
+ set_func_state(&state.cfi);
warnings += validate_symbol(file, sec, func, &state);
}
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index d8421e1d06bed..93fa833a49a5a 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -15,10 +15,10 @@
#include <string.h>
#include <unistd.h>
#include <errno.h>
-#include "builtin.h"
+#include <objtool/builtin.h>
-#include "elf.h"
-#include "warn.h"
+#include <objtool/elf.h>
+#include <objtool/warn.h>
#define MAX_NAME_LEN 128
@@ -43,75 +43,24 @@ static void elf_hash_init(struct hlist_head *table)
#define elf_hash_for_each_possible(name, obj, member, key) \
hlist_for_each_entry(obj, &name[hash_min(key, elf_hash_bits())], member)
-static void rb_add(struct rb_root *tree, struct rb_node *node,
- int (*cmp)(struct rb_node *, const struct rb_node *))
-{
- struct rb_node **link = &tree->rb_node;
- struct rb_node *parent = NULL;
-
- while (*link) {
- parent = *link;
- if (cmp(node, parent) < 0)
- link = &parent->rb_left;
- else
- link = &parent->rb_right;
- }
-
- rb_link_node(node, parent, link);
- rb_insert_color(node, tree);
-}
-
-static struct rb_node *rb_find_first(const struct rb_root *tree, const void *key,
- int (*cmp)(const void *key, const struct rb_node *))
-{
- struct rb_node *node = tree->rb_node;
- struct rb_node *match = NULL;
-
- while (node) {
- int c = cmp(key, node);
- if (c <= 0) {
- if (!c)
- match = node;
- node = node->rb_left;
- } else if (c > 0) {
- node = node->rb_right;
- }
- }
-
- return match;
-}
-
-static struct rb_node *rb_next_match(struct rb_node *node, const void *key,
- int (*cmp)(const void *key, const struct rb_node *))
-{
- node = rb_next(node);
- if (node && cmp(key, node))
- node = NULL;
- return node;
-}
-
-#define rb_for_each(tree, node, key, cmp) \
- for ((node) = rb_find_first((tree), (key), (cmp)); \
- (node); (node) = rb_next_match((node), (key), (cmp)))
-
-static int symbol_to_offset(struct rb_node *a, const struct rb_node *b)
+static bool symbol_to_offset(struct rb_node *a, const struct rb_node *b)
{
struct symbol *sa = rb_entry(a, struct symbol, node);
struct symbol *sb = rb_entry(b, struct symbol, node);
if (sa->offset < sb->offset)
- return -1;
+ return true;
if (sa->offset > sb->offset)
- return 1;
+ return false;
if (sa->len < sb->len)
- return -1;
+ return true;
if (sa->len > sb->len)
- return 1;
+ return false;
sa->alias = sb;
- return 0;
+ return false;
}
static int symbol_by_offset(const void *key, const struct rb_node *node)
@@ -165,7 +114,7 @@ struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset)
{
struct rb_node *node;
- rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) {
+ rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) {
struct symbol *s = rb_entry(node, struct symbol, node);
if (s->offset == offset && s->type != STT_SECTION)
@@ -179,7 +128,7 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset)
{
struct rb_node *node;
- rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) {
+ rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) {
struct symbol *s = rb_entry(node, struct symbol, node);
if (s->offset == offset && s->type == STT_FUNC)
@@ -193,7 +142,7 @@ struct symbol *find_symbol_containing(const struct section *sec, unsigned long o
{
struct rb_node *node;
- rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) {
+ rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) {
struct symbol *s = rb_entry(node, struct symbol, node);
if (s->type != STT_SECTION)
@@ -207,7 +156,7 @@ struct symbol *find_func_containing(struct section *sec, unsigned long offset)
{
struct rb_node *node;
- rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) {
+ rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) {
struct symbol *s = rb_entry(node, struct symbol, node);
if (s->type == STT_FUNC)
@@ -442,7 +391,7 @@ static int read_symbols(struct elf *elf)
sym->offset = sym->sym.st_value;
sym->len = sym->sym.st_size;
- rb_add(&sym->sec->symbol_tree, &sym->node, symbol_to_offset);
+ rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset);
pnode = rb_prev(&sym->node);
if (pnode)
entry = &rb_entry(pnode, struct symbol, node)->list;
@@ -865,25 +814,27 @@ static int elf_rebuild_rel_reloc_section(struct section *sec, int nr)
{
struct reloc *reloc;
int idx = 0, size;
- GElf_Rel *relocs;
+ void *buf;
/* Allocate a buffer for relocations */
- size = nr * sizeof(*relocs);
- relocs = malloc(size);
- if (!relocs) {
+ size = nr * sizeof(GElf_Rel);
+ buf = malloc(size);
+ if (!buf) {
perror("malloc");
return -1;
}
- sec->data->d_buf = relocs;
+ sec->data->d_buf = buf;
sec->data->d_size = size;
+ sec->data->d_type = ELF_T_REL;
sec->sh.sh_size = size;
idx = 0;
list_for_each_entry(reloc, &sec->reloc_list, list) {
- relocs[idx].r_offset = reloc->offset;
- relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type);
+ reloc->rel.r_offset = reloc->offset;
+ reloc->rel.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type);
+ gelf_update_rel(sec->data, idx, &reloc->rel);
idx++;
}
@@ -894,26 +845,28 @@ static int elf_rebuild_rela_reloc_section(struct section *sec, int nr)
{
struct reloc *reloc;
int idx = 0, size;
- GElf_Rela *relocs;
+ void *buf;
/* Allocate a buffer for relocations with addends */
- size = nr * sizeof(*relocs);
- relocs = malloc(size);
- if (!relocs) {
+ size = nr * sizeof(GElf_Rela);
+ buf = malloc(size);
+ if (!buf) {
perror("malloc");
return -1;
}
- sec->data->d_buf = relocs;
+ sec->data->d_buf = buf;
sec->data->d_size = size;
+ sec->data->d_type = ELF_T_RELA;
sec->sh.sh_size = size;
idx = 0;
list_for_each_entry(reloc, &sec->reloc_list, list) {
- relocs[idx].r_offset = reloc->offset;
- relocs[idx].r_addend = reloc->addend;
- relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type);
+ reloc->rela.r_offset = reloc->offset;
+ reloc->rela.r_addend = reloc->addend;
+ reloc->rela.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type);
+ gelf_update_rela(sec->data, idx, &reloc->rela);
idx++;
}
diff --git a/tools/objtool/arch.h b/tools/objtool/include/objtool/arch.h
index 4a84c3081b8e1..6ff0685f5cc5a 100644
--- a/tools/objtool/arch.h
+++ b/tools/objtool/include/objtool/arch.h
@@ -8,12 +8,8 @@
#include <stdbool.h>
#include <linux/list.h>
-#include "objtool.h"
-#include "cfi.h"
-
-#ifdef INSN_USE_ORC
-#include <asm/orc_types.h>
-#endif
+#include <objtool/objtool.h>
+#include <objtool/cfi.h>
enum insn_type {
INSN_JUMP_CONDITIONAL,
diff --git a/tools/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h
index 85c979caa3677..85c979caa3677 100644
--- a/tools/objtool/builtin.h
+++ b/tools/objtool/include/objtool/builtin.h
diff --git a/tools/objtool/cfi.h b/tools/objtool/include/objtool/cfi.h
index c7c59c6a44eea..fd5cb0bed9bf0 100644
--- a/tools/objtool/cfi.h
+++ b/tools/objtool/include/objtool/cfi.h
@@ -6,7 +6,7 @@
#ifndef _OBJTOOL_CFI_H
#define _OBJTOOL_CFI_H
-#include "cfi_regs.h"
+#include <arch/cfi_regs.h>
#define CFI_UNDEFINED -1
#define CFI_CFA -2
diff --git a/tools/objtool/check.h b/tools/objtool/include/objtool/check.h
index 5ec00a4b891b6..4891ead0e85f6 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -7,8 +7,8 @@
#define _CHECK_H
#include <stdbool.h>
-#include "cfi.h"
-#include "arch.h"
+#include <objtool/cfi.h>
+#include <objtool/arch.h>
struct insn_state {
struct cfi_state cfi;
@@ -19,6 +19,23 @@ struct insn_state {
s8 instr;
};
+struct alt_group {
+ /*
+ * Pointer from a replacement group to the original group. NULL if it
+ * *is* the original group.
+ */
+ struct alt_group *orig_group;
+
+ /* First and last instructions in the group */
+ struct instruction *first_insn, *last_insn;
+
+ /*
+ * Byte-offset-addressed len-sized array of pointers to CFI structs.
+ * This is shared with the other alt_groups in the same alternative.
+ */
+ struct cfi_state **cfi;
+};
+
struct instruction {
struct list_head list;
struct hlist_node hash;
@@ -33,8 +50,7 @@ struct instruction {
bool retpoline_safe;
s8 instr;
u8 visited;
- u8 ret_offset;
- int alt_group;
+ struct alt_group *alt_group;
struct symbol *call_dest;
struct instruction *jump_dest;
struct instruction *first_jump_src;
@@ -43,9 +59,6 @@ struct instruction {
struct symbol *func;
struct list_head stack_ops;
struct cfi_state cfi;
-#ifdef INSN_USE_ORC
- struct orc_entry orc;
-#endif
};
static inline bool is_static_jump(struct instruction *insn)
@@ -54,6 +67,17 @@ static inline bool is_static_jump(struct instruction *insn)
insn->type == INSN_JUMP_UNCONDITIONAL;
}
+static inline bool is_dynamic_jump(struct instruction *insn)
+{
+ return insn->type == INSN_JUMP_DYNAMIC ||
+ insn->type == INSN_JUMP_DYNAMIC_CONDITIONAL;
+}
+
+static inline bool is_jump(struct instruction *insn)
+{
+ return is_static_jump(insn) || is_dynamic_jump(insn);
+}
+
struct instruction *find_insn(struct objtool_file *file,
struct section *sec, unsigned long offset);
diff --git a/tools/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index e6890cc70a25b..e6890cc70a25b 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
diff --git a/tools/objtool/include/objtool/endianness.h b/tools/objtool/include/objtool/endianness.h
new file mode 100644
index 0000000000000..10241341eff35
--- /dev/null
+++ b/tools/objtool/include/objtool/endianness.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _OBJTOOL_ENDIANNESS_H
+#define _OBJTOOL_ENDIANNESS_H
+
+#include <arch/endianness.h>
+#include <linux/kernel.h>
+#include <endian.h>
+
+#ifndef __TARGET_BYTE_ORDER
+#error undefined arch __TARGET_BYTE_ORDER
+#endif
+
+#if __BYTE_ORDER != __TARGET_BYTE_ORDER
+#define __NEED_BSWAP 1
+#else
+#define __NEED_BSWAP 0
+#endif
+
+/*
+ * Does a byte swap if target endianness doesn't match the host, i.e. cross
+ * compilation for little endian on big endian and vice versa.
+ * Use it to convert multi-byte values that were read from, or are about to
+ * be written to, an ELF file in the target's native endianness.
+ */
+#define bswap_if_needed(val) \
+({ \
+ __typeof__(val) __ret; \
+ switch (sizeof(val)) { \
+ case 8: __ret = __NEED_BSWAP ? bswap_64(val) : (val); break; \
+ case 4: __ret = __NEED_BSWAP ? bswap_32(val) : (val); break; \
+ case 2: __ret = __NEED_BSWAP ? bswap_16(val) : (val); break; \
+ default: \
+ BUILD_BUG(); break; \
+ } \
+ __ret; \
+})
+
+#endif /* _OBJTOOL_ENDIANNESS_H */
diff --git a/tools/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h
index 4125d4578b23b..e114642efb652 100644
--- a/tools/objtool/objtool.h
+++ b/tools/objtool/include/objtool/objtool.h
@@ -10,7 +10,7 @@
#include <linux/list.h>
#include <linux/hashtable.h>
-#include "elf.h"
+#include <objtool/elf.h>
#define __weak __attribute__((weak))
@@ -26,7 +26,6 @@ struct objtool_file *objtool_open_read(const char *_objname);
int check(struct objtool_file *file);
int orc_dump(const char *objname);
-int create_orc(struct objtool_file *file);
-int create_orc_sections(struct objtool_file *file);
+int orc_create(struct objtool_file *file);
#endif /* _OBJTOOL_H */
diff --git a/tools/objtool/special.h b/tools/objtool/include/objtool/special.h
index abddf38ef3346..8a09f4e9d480e 100644
--- a/tools/objtool/special.h
+++ b/tools/objtool/include/objtool/special.h
@@ -7,8 +7,8 @@
#define _SPECIAL_H
#include <stdbool.h>
-#include "check.h"
-#include "elf.h"
+#include <objtool/check.h>
+#include <objtool/elf.h>
#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table"
diff --git a/tools/objtool/warn.h b/tools/objtool/include/objtool/warn.h
index 7799f60de80af..d99c4675e4a5f 100644
--- a/tools/objtool/warn.h
+++ b/tools/objtool/include/objtool/warn.h
@@ -11,7 +11,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
-#include "elf.h"
+#include <objtool/elf.h>
extern const char *objname;
diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
index 9df0cd86d310d..e848feb0a5fc6 100644
--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -21,9 +21,9 @@
#include <subcmd/pager.h>
#include <linux/kernel.h>
-#include "builtin.h"
-#include "objtool.h"
-#include "warn.h"
+#include <objtool/builtin.h>
+#include <objtool/objtool.h>
+#include <objtool/warn.h>
struct cmd_struct {
const char *name;
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index 5e6a95368d351..f5a8508c42d6d 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -6,8 +6,9 @@
#include <unistd.h>
#include <linux/objtool.h>
#include <asm/orc_types.h>
-#include "objtool.h"
-#include "warn.h"
+#include <objtool/objtool.h>
+#include <objtool/warn.h>
+#include <objtool/endianness.h>
static const char *reg_name(unsigned int reg)
{
@@ -54,7 +55,7 @@ static void print_reg(unsigned int reg, int offset)
if (reg == ORC_REG_BP_INDIRECT)
printf("(bp%+d)", offset);
else if (reg == ORC_REG_SP_INDIRECT)
- printf("(sp%+d)", offset);
+ printf("(sp)%+d", offset);
else if (reg == ORC_REG_UNDEFINED)
printf("(und)");
else
@@ -197,11 +198,11 @@ int orc_dump(const char *_objname)
printf(" sp:");
- print_reg(orc[i].sp_reg, orc[i].sp_offset);
+ print_reg(orc[i].sp_reg, bswap_if_needed(orc[i].sp_offset));
printf(" bp:");
- print_reg(orc[i].bp_reg, orc[i].bp_offset);
+ print_reg(orc[i].bp_reg, bswap_if_needed(orc[i].bp_offset));
printf(" type:%s end:%d\n",
orc_type_name(orc[i].type), orc[i].end);
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index 9ce68b385a1b8..738aa5021bc4b 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -9,93 +9,91 @@
#include <linux/objtool.h>
#include <asm/orc_types.h>
-#include "check.h"
-#include "warn.h"
+#include <objtool/check.h>
+#include <objtool/warn.h>
+#include <objtool/endianness.h>
-int create_orc(struct objtool_file *file)
+static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi)
{
- struct instruction *insn;
+ struct instruction *insn = container_of(cfi, struct instruction, cfi);
+ struct cfi_reg *bp = &cfi->regs[CFI_BP];
- for_each_insn(file, insn) {
- struct orc_entry *orc = &insn->orc;
- struct cfi_reg *cfa = &insn->cfi.cfa;
- struct cfi_reg *bp = &insn->cfi.regs[CFI_BP];
+ memset(orc, 0, sizeof(*orc));
- if (!insn->sec->text)
- continue;
-
- orc->end = insn->cfi.end;
-
- if (cfa->base == CFI_UNDEFINED) {
- orc->sp_reg = ORC_REG_UNDEFINED;
- continue;
- }
+ orc->end = cfi->end;
- switch (cfa->base) {
- case CFI_SP:
- orc->sp_reg = ORC_REG_SP;
- break;
- case CFI_SP_INDIRECT:
- orc->sp_reg = ORC_REG_SP_INDIRECT;
- break;
- case CFI_BP:
- orc->sp_reg = ORC_REG_BP;
- break;
- case CFI_BP_INDIRECT:
- orc->sp_reg = ORC_REG_BP_INDIRECT;
- break;
- case CFI_R10:
- orc->sp_reg = ORC_REG_R10;
- break;
- case CFI_R13:
- orc->sp_reg = ORC_REG_R13;
- break;
- case CFI_DI:
- orc->sp_reg = ORC_REG_DI;
- break;
- case CFI_DX:
- orc->sp_reg = ORC_REG_DX;
- break;
- default:
- WARN_FUNC("unknown CFA base reg %d",
- insn->sec, insn->offset, cfa->base);
- return -1;
- }
+ if (cfi->cfa.base == CFI_UNDEFINED) {
+ orc->sp_reg = ORC_REG_UNDEFINED;
+ return 0;
+ }
- switch(bp->base) {
- case CFI_UNDEFINED:
- orc->bp_reg = ORC_REG_UNDEFINED;
- break;
- case CFI_CFA:
- orc->bp_reg = ORC_REG_PREV_SP;
- break;
- case CFI_BP:
- orc->bp_reg = ORC_REG_BP;
- break;
- default:
- WARN_FUNC("unknown BP base reg %d",
- insn->sec, insn->offset, bp->base);
- return -1;
- }
+ switch (cfi->cfa.base) {
+ case CFI_SP:
+ orc->sp_reg = ORC_REG_SP;
+ break;
+ case CFI_SP_INDIRECT:
+ orc->sp_reg = ORC_REG_SP_INDIRECT;
+ break;
+ case CFI_BP:
+ orc->sp_reg = ORC_REG_BP;
+ break;
+ case CFI_BP_INDIRECT:
+ orc->sp_reg = ORC_REG_BP_INDIRECT;
+ break;
+ case CFI_R10:
+ orc->sp_reg = ORC_REG_R10;
+ break;
+ case CFI_R13:
+ orc->sp_reg = ORC_REG_R13;
+ break;
+ case CFI_DI:
+ orc->sp_reg = ORC_REG_DI;
+ break;
+ case CFI_DX:
+ orc->sp_reg = ORC_REG_DX;
+ break;
+ default:
+ WARN_FUNC("unknown CFA base reg %d",
+ insn->sec, insn->offset, cfi->cfa.base);
+ return -1;
+ }
- orc->sp_offset = cfa->offset;
- orc->bp_offset = bp->offset;
- orc->type = insn->cfi.type;
+ switch (bp->base) {
+ case CFI_UNDEFINED:
+ orc->bp_reg = ORC_REG_UNDEFINED;
+ break;
+ case CFI_CFA:
+ orc->bp_reg = ORC_REG_PREV_SP;
+ break;
+ case CFI_BP:
+ orc->bp_reg = ORC_REG_BP;
+ break;
+ default:
+ WARN_FUNC("unknown BP base reg %d",
+ insn->sec, insn->offset, bp->base);
+ return -1;
}
+ orc->sp_offset = cfi->cfa.offset;
+ orc->bp_offset = bp->offset;
+ orc->type = cfi->type;
+
return 0;
}
-static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relocsec,
- unsigned int idx, struct section *insn_sec,
- unsigned long insn_off, struct orc_entry *o)
+static int write_orc_entry(struct elf *elf, struct section *orc_sec,
+ struct section *ip_rsec, unsigned int idx,
+ struct section *insn_sec, unsigned long insn_off,
+ struct orc_entry *o)
{
struct orc_entry *orc;
struct reloc *reloc;
/* populate ORC data */
- orc = (struct orc_entry *)u_sec->data->d_buf + idx;
+ orc = (struct orc_entry *)orc_sec->data->d_buf + idx;
memcpy(orc, o, sizeof(*orc));
+ orc->sp_offset = bswap_if_needed(orc->sp_offset);
+ orc->bp_offset = bswap_if_needed(orc->bp_offset);
/* populate reloc for ip */
reloc = malloc(sizeof(*reloc));
@@ -114,102 +112,149 @@ static int create_orc_entry(struct elf *elf, struct section *u_sec, struct secti
reloc->type = R_X86_64_PC32;
reloc->offset = idx * sizeof(int);
- reloc->sec = ip_relocsec;
+ reloc->sec = ip_rsec;
elf_add_reloc(elf, reloc);
return 0;
}
-int create_orc_sections(struct objtool_file *file)
-{
- struct instruction *insn, *prev_insn;
- struct section *sec, *u_sec, *ip_relocsec;
- unsigned int idx;
+struct orc_list_entry {
+ struct list_head list;
+ struct orc_entry orc;
+ struct section *insn_sec;
+ unsigned long insn_off;
+};
- struct orc_entry empty = {
- .sp_reg = ORC_REG_UNDEFINED,
- .bp_reg = ORC_REG_UNDEFINED,
- .type = UNWIND_HINT_TYPE_CALL,
- };
+static int orc_list_add(struct list_head *orc_list, struct orc_entry *orc,
+ struct section *sec, unsigned long offset)
+{
+ struct orc_list_entry *entry = malloc(sizeof(*entry));
- sec = find_section_by_name(file->elf, ".orc_unwind");
- if (sec) {
- WARN("file already has .orc_unwind section, skipping");
+ if (!entry) {
+ WARN("malloc failed");
return -1;
}
- /* count the number of needed orcs */
- idx = 0;
- for_each_sec(file, sec) {
- if (!sec->text)
- continue;
-
- prev_insn = NULL;
- sec_for_each_insn(file, sec, insn) {
- if (!prev_insn ||
- memcmp(&insn->orc, &prev_insn->orc,
- sizeof(struct orc_entry))) {
- idx++;
- }
- prev_insn = insn;
- }
-
- /* section terminator */
- if (prev_insn)
- idx++;
- }
- if (!idx)
- return -1;
+ entry->orc = *orc;
+ entry->insn_sec = sec;
+ entry->insn_off = offset;
+ list_add_tail(&entry->list, orc_list);
+ return 0;
+}
- /* create .orc_unwind_ip and .rela.orc_unwind_ip sections */
- sec = elf_create_section(file->elf, ".orc_unwind_ip", 0, sizeof(int), idx);
- if (!sec)
- return -1;
+static unsigned long alt_group_len(struct alt_group *alt_group)
+{
+ return alt_group->last_insn->offset +
+ alt_group->last_insn->len -
+ alt_group->first_insn->offset;
+}
- ip_relocsec = elf_create_reloc_section(file->elf, sec, SHT_RELA);
- if (!ip_relocsec)
- return -1;
+int orc_create(struct objtool_file *file)
+{
+ struct section *sec, *ip_rsec, *orc_sec;
+ unsigned int nr = 0, idx = 0;
+ struct orc_list_entry *entry;
+ struct list_head orc_list;
- /* create .orc_unwind section */
- u_sec = elf_create_section(file->elf, ".orc_unwind", 0,
- sizeof(struct orc_entry), idx);
+ struct orc_entry null = {
+ .sp_reg = ORC_REG_UNDEFINED,
+ .bp_reg = ORC_REG_UNDEFINED,
+ .type = UNWIND_HINT_TYPE_CALL,
+ };
- /* populate sections */
- idx = 0;
+ /* Build a deduplicated list of ORC entries: */
+ INIT_LIST_HEAD(&orc_list);
for_each_sec(file, sec) {
+ struct orc_entry orc, prev_orc = {0};
+ struct instruction *insn;
+ bool empty = true;
+
if (!sec->text)
continue;
- prev_insn = NULL;
sec_for_each_insn(file, sec, insn) {
- if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc,
- sizeof(struct orc_entry))) {
+ struct alt_group *alt_group = insn->alt_group;
+ int i;
- if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx,
- insn->sec, insn->offset,
- &insn->orc))
+ if (!alt_group) {
+ if (init_orc_entry(&orc, &insn->cfi))
return -1;
+ if (!memcmp(&prev_orc, &orc, sizeof(orc)))
+ continue;
+ if (orc_list_add(&orc_list, &orc, sec,
+ insn->offset))
+ return -1;
+ nr++;
+ prev_orc = orc;
+ empty = false;
+ continue;
+ }
- idx++;
+ /*
+ * Alternatives can have different stack layout
+ * possibilities (but they shouldn't conflict).
+ * Instead of traversing the instructions, use the
+ * alt_group's flattened byte-offset-addressed CFI
+ * array.
+ */
+ for (i = 0; i < alt_group_len(alt_group); i++) {
+ struct cfi_state *cfi = alt_group->cfi[i];
+ if (!cfi)
+ continue;
+ if (init_orc_entry(&orc, cfi))
+ return -1;
+ if (!memcmp(&prev_orc, &orc, sizeof(orc)))
+ continue;
+ if (orc_list_add(&orc_list, &orc, insn->sec,
+ insn->offset + i))
+ return -1;
+ nr++;
+ prev_orc = orc;
+ empty = false;
}
- prev_insn = insn;
- }
- /* section terminator */
- if (prev_insn) {
- if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx,
- prev_insn->sec,
- prev_insn->offset + prev_insn->len,
- &empty))
- return -1;
+ /* Skip to the end of the alt_group */
+ insn = alt_group->last_insn;
+ }
- idx++;
+ /* Add a section terminator */
+ if (!empty) {
+ orc_list_add(&orc_list, &null, sec, sec->len);
+ nr++;
}
}
+ if (!nr)
+ return 0;
+
+ /* Create .orc_unwind, .orc_unwind_ip and .rela.orc_unwind_ip sections: */
+ sec = find_section_by_name(file->elf, ".orc_unwind");
+ if (sec) {
+ WARN("file already has .orc_unwind section, skipping");
+ return -1;
+ }
+ orc_sec = elf_create_section(file->elf, ".orc_unwind", 0,
+ sizeof(struct orc_entry), nr);
+ if (!orc_sec)
+ return -1;
+
+ sec = elf_create_section(file->elf, ".orc_unwind_ip", 0, sizeof(int), nr);
+ if (!sec)
+ return -1;
+ ip_rsec = elf_create_reloc_section(file->elf, sec, SHT_RELA);
+ if (!ip_rsec)
+ return -1;
+
+ /* Write ORC entries to sections: */
+ list_for_each_entry(entry, &orc_list, list) {
+ if (write_orc_entry(file->elf, orc_sec, ip_rsec, idx++,
+ entry->insn_sec, entry->insn_off,
+ &entry->orc))
+ return -1;
+ }
- if (elf_rebuild_reloc_section(file->elf, ip_relocsec))
+ if (elf_rebuild_reloc_section(file->elf, ip_rsec))
return -1;
return 0;
diff --git a/tools/objtool/special.c b/tools/objtool/special.c
index 1a2420febd08a..2c7fbda7b0557 100644
--- a/tools/objtool/special.c
+++ b/tools/objtool/special.c
@@ -11,10 +11,11 @@
#include <stdlib.h>
#include <string.h>
-#include "builtin.h"
-#include "special.h"
-#include "warn.h"
-#include "arch_special.h"
+#include <arch/special.h>
+#include <objtool/builtin.h>
+#include <objtool/special.h>
+#include <objtool/warn.h>
+#include <objtool/endianness.h>
struct special_entry {
const char *sec;
@@ -77,8 +78,9 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry,
if (entry->feature) {
unsigned short feature;
- feature = *(unsigned short *)(sec->data->d_buf + offset +
- entry->feature);
+ feature = bswap_if_needed(*(unsigned short *)(sec->data->d_buf +
+ offset +
+ entry->feature));
arch_handle_alternative(feature, alt);
}
diff --git a/tools/objtool/weak.c b/tools/objtool/weak.c
index 7843e9a7a72f4..8314e824db4ae 100644
--- a/tools/objtool/weak.c
+++ b/tools/objtool/weak.c
@@ -7,7 +7,7 @@
#include <stdbool.h>
#include <errno.h>
-#include "objtool.h"
+#include <objtool/objtool.h>
#define UNSUPPORTED(name) \
({ \
@@ -25,12 +25,7 @@ int __weak orc_dump(const char *_objname)
UNSUPPORTED("orc");
}
-int __weak create_orc(struct objtool_file *file)
-{
- UNSUPPORTED("orc");
-}
-
-int __weak create_orc_sections(struct objtool_file *file)
+int __weak orc_create(struct objtool_file *file)
{
UNSUPPORTED("orc");
}
diff --git a/tools/testing/selftests/rcutorture/bin/config2csv.sh b/tools/testing/selftests/rcutorture/bin/config2csv.sh
new file mode 100755
index 0000000000000..d5a16631b16ee
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/config2csv.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Create a spreadsheet from torture-test Kconfig options and kernel boot
+# parameters. Run this in the directory containing the scenario files.
+#
+# Usage: config2csv path.csv [ "scenario1 scenario2 ..." ]
+#
+# By default, this script will take the list of scenarios from the CFLIST
+# file in that directory, otherwise it will consider only the scenarios
+# specified on the command line. It will examine each scenario's file
+# and also its .boot file, if present, and create a column in the .csv
+# output file. Note that "CFLIST" is a synonym for all the scenarios in the
+# CFLIST file, which allows easy comparison of those scenarios with selected
+# scenarios such as BUSTED that are normally omitted from CFLIST files.
+
+csvout=${1}
+if test -z "$csvout"
+then
+ echo "Need .csv output file as first argument."
+ exit 1
+fi
+shift
+defaultconfigs="`tr '\012' ' ' < CFLIST`"
+if test "$#" -eq 0
+then
+ scenariosarg=$defaultconfigs
+else
+ scenariosarg=$*
+fi
+scenarios="`echo $scenariosarg | sed -e "s/\<CFLIST\>/$defaultconfigs/g"`"
+
+T=/tmp/config2latex.sh.$$
+trap 'rm -rf $T' 0
+mkdir $T
+
+cat << '---EOF---' >> $T/p.awk
+END {
+---EOF---
+for i in $scenarios
+do
+ echo ' s["'$i'"] = 1;' >> $T/p.awk
+ grep -v '^#' < $i | grep -v '^ *$' > $T/p
+ if test -r $i.boot
+ then
+ tr -s ' ' '\012' < $i.boot | grep -v '^#' >> $T/p
+ fi
+ sed -e 's/^[^=]*$/&=?/' < $T/p |
+ sed -e 's/^\([^=]*\)=\(.*\)$/\tp["\1:'"$i"'"] = "\2";\n\tc["\1"] = 1;/' >> $T/p.awk
+done
+cat << '---EOF---' >> $T/p.awk
+ ns = asorti(s, ss);
+ nc = asorti(c, cs);
+ for (j = 1; j <= ns; j++)
+ printf ",\"%s\"", ss[j];
+ printf "\n";
+ for (i = 1; i <= nc; i++) {
+ printf "\"%s\"", cs[i];
+ for (j = 1; j <= ns; j++) {
+ printf ",\"%s\"", p[cs[i] ":" ss[j]];
+ }
+ printf "\n";
+ }
+}
+---EOF---
+awk -f $T/p.awk < /dev/null > $T/p.csv
+cp $T/p.csv $csvout
diff --git a/tools/testing/selftests/rcutorture/bin/console-badness.sh b/tools/testing/selftests/rcutorture/bin/console-badness.sh
index 80ae7f08b363e..e6a132df61721 100755
--- a/tools/testing/selftests/rcutorture/bin/console-badness.sh
+++ b/tools/testing/selftests/rcutorture/bin/console-badness.sh
@@ -14,4 +14,5 @@ egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls o
grep -v 'ODEBUG: ' |
grep -v 'This means that this is a DEBUG kernel and it is' |
grep -v 'Warning: unable to open an initial console' |
+grep -v 'Warning: Failed to add ttynull console. No stdin, stdout, and stderr.*the init process!' |
grep -v 'NOHZ tick-stop error: Non-RCU local softirq work is pending, handler'
diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index 82663495fb383..c35ba24f994c3 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -108,6 +108,39 @@ configfrag_hotplug_cpu () {
grep -q '^CONFIG_HOTPLUG_CPU=y$' "$1"
}
+# get_starttime
+#
+# Returns a cookie identifying the current time.
+get_starttime () {
+ awk 'BEGIN { print systime() }' < /dev/null
+}
+
+# get_starttime_duration starttime
+#
+# Given the return value from get_starttime, compute a human-readable
+# string denoting the time since get_starttime.
+get_starttime_duration () {
+ awk -v starttime=$1 '
+ BEGIN {
+ ts = systime() - starttime;
+ tm = int(ts / 60);
+ th = int(ts / 3600);
+ td = int(ts / 86400);
+ d = td;
+ h = th - td * 24;
+ m = tm - th * 60;
+ s = ts - tm * 60;
+ if (d >= 1)
+ printf "%dd %d:%02d:%02d\n", d, h, m, s
+ else if (h >= 1)
+ printf "%d:%02d:%02d\n", h, m, s
+ else if (m >= 1)
+ printf "%d:%02d.0\n", m, s
+ else
+ print s " seconds"
+ }' < /dev/null
+}
+
# identify_boot_image qemu-cmd
#
# Returns the relative path to the kernel build image. This will be
@@ -170,6 +203,7 @@ identify_qemu () {
# and the TORTURE_QEMU_INTERACTIVE environment variable.
identify_qemu_append () {
echo debug_boot_weak_hash
+ echo panic=-1
local console=ttyS0
case "$1" in
qemu-system-x86_64|qemu-system-i386)
@@ -232,7 +266,7 @@ identify_qemu_args () {
# Returns the number of virtual CPUs available to the aggregate of the
# guest OSes.
identify_qemu_vcpus () {
- lscpu | grep '^CPU(s):' | sed -e 's/CPU(s)://' -e 's/[ ]*//g'
+ getconf _NPROCESSORS_ONLN
}
# print_bug
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
index 6f50722f251f8..0670841122d8a 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
@@ -39,12 +39,14 @@ done
if test -n "$files"
then
$editor $files
+ editorret=1
else
echo No build errors.
fi
if grep -q -e "--buildonly" < ${rundir}/log
then
echo Build-only run, no console logs to check.
+ exit $editorret
fi
# Find console logs with errors
@@ -62,5 +64,10 @@ then
exit 1
else
echo No errors in console logs.
- exit 0
+ if test -n "$editorret"
+ then
+ exit $editorret
+ else
+ exit 0
+ fi
fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index 840a4679a0d78..47cf4db10896c 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -87,15 +87,16 @@ do
fi
done
EDITOR=echo kvm-find-errors.sh "${@: -1}" > $T 2>&1
-ret=$?
builderrors="`tr ' ' '\012' < $T | grep -c '/Make.out.diags'`"
if test "$builderrors" -gt 0
then
echo $builderrors runs with build errors.
+ ret=1
fi
runerrors="`tr ' ' '\012' < $T | grep -c '/console.log.diags'`"
if test "$runerrors" -gt 0
then
echo $runerrors runs with runtime errors.
+ ret=2
fi
exit $ret
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 3cd03d01857cf..536d103ef1667 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -125,7 +125,6 @@ seconds=$4
qemu_args=$5
boot_args=$6
-kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null`
if test -z "$TORTURE_BUILDONLY"
then
echo ' ---' `date`: Starting kernel
@@ -158,6 +157,8 @@ then
boot_args="$boot_args $TORTURE_BOOT_GDB_ARG"
fi
echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" $TORTURE_QEMU_GDB_ARG > $resdir/qemu-cmd
+echo "# TORTURE_SHUTDOWN_GRACE=$TORTURE_SHUTDOWN_GRACE" >> $resdir/qemu-cmd
+echo "# seconds=$seconds" >> $resdir/qemu-cmd
if test -n "$TORTURE_BUILDONLY"
then
@@ -174,6 +175,7 @@ echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd
echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log
# Attempt to run qemu
+kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null`
( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) &
commandcompleted=0
if test -z "$TORTURE_KCONFIG_GDB_ARG"
@@ -209,7 +211,7 @@ do
if test -n "$TORTURE_KCONFIG_GDB_ARG"
then
:
- elif test $kruntime -ge $seconds || test -f "$TORTURE_STOPFILE"
+ elif test $kruntime -ge $seconds || test -f "$resdir/../STOP.1"
then
break;
fi
@@ -252,16 +254,16 @@ then
fi
if test $commandcompleted -eq 0 -a -n "$qemu_pid"
then
- if ! test -f "$TORTURE_STOPFILE"
+ if ! test -f "$resdir/../STOP.1"
then
echo Grace period for qemu job at pid $qemu_pid
fi
oldline="`tail $resdir/console.log`"
while :
do
- if test -f "$TORTURE_STOPFILE"
+ if test -f "$resdir/../STOP.1"
then
- echo "PID $qemu_pid killed due to run STOP request" >> $resdir/Warnings 2>&1
+ echo "PID $qemu_pid killed due to run STOP.1 request" >> $resdir/Warnings 2>&1
kill -KILL $qemu_pid
break
fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 45d07b7b69f59..8d3c99b35e060 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -47,6 +47,9 @@ cpus=0
ds=`date +%Y.%m.%d-%H.%M.%S`
jitter="-1"
+startdate="`date`"
+starttime="`get_starttime`"
+
usage () {
echo "Usage: $scriptname optional arguments:"
echo " --allcpus"
@@ -57,7 +60,7 @@ usage () {
echo " --cpus N"
echo " --datestamp string"
echo " --defconfig string"
- echo " --dryrun sched|script"
+ echo " --dryrun batches|sched|script"
echo " --duration minutes | <seconds>s | <hours>h | <days>d"
echo " --gdb"
echo " --help"
@@ -85,7 +88,7 @@ do
;;
--bootargs|--bootarg)
checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--'
- TORTURE_BOOTARGS="$2"
+ TORTURE_BOOTARGS="$TORTURE_BOOTARGS $2"
shift
;;
--bootimage)
@@ -97,8 +100,8 @@ do
TORTURE_BUILDONLY=1
;;
--configs|--config)
- checkarg --configs "(list of config files)" "$#" "$2" '^[^/]*$' '^--'
- configs="$2"
+ checkarg --configs "(list of config files)" "$#" "$2" '^[^/]\+$' '^--'
+ configs="$configs $2"
shift
;;
--cpus)
@@ -113,7 +116,7 @@ do
shift
;;
--datestamp)
- checkarg --datestamp "(relative pathname)" "$#" "$2" '^[^/]*$' '^--'
+ checkarg --datestamp "(relative pathname)" "$#" "$2" '^[a-zA-Z0-9._/-]*$' '^--'
ds=$2
shift
;;
@@ -123,7 +126,7 @@ do
shift
;;
--dryrun)
- checkarg --dryrun "sched|script" $# "$2" 'sched\|script' '^--'
+ checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|sched\|script' '^--'
dryrun=$2
shift
;;
@@ -162,18 +165,18 @@ do
;;
--kconfig|--kconfigs)
checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$'
- TORTURE_KCONFIG_ARG="$2"
+ TORTURE_KCONFIG_ARG="`echo "$TORTURE_KCONFIG_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`"
shift
;;
--kasan)
TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
;;
--kcsan)
- TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_KCSAN_INTERRUPT_WATCHER=y"; export TORTURE_KCONFIG_KCSAN_ARG
+ TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_INTERRUPT_WATCHER=y CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
;;
--kmake-arg|--kmake-args)
checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'
- TORTURE_KMAKE_ARG="$2"
+ TORTURE_KMAKE_ARG="`echo "$TORTURE_KMAKE_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`"
shift
;;
--mac)
@@ -191,7 +194,7 @@ do
;;
--qemu-args|--qemu-arg)
checkarg --qemu-args "(qemu arguments)" $# "$2" '^-' '^error'
- TORTURE_QEMU_ARG="$2"
+ TORTURE_QEMU_ARG="`echo "$TORTURE_QEMU_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`"
shift
;;
--qemu-cmd)
@@ -232,7 +235,7 @@ do
shift
done
-if test -z "$TORTURE_INITRD" || tools/testing/selftests/rcutorture/bin/mkinitrd.sh
+if test -n "$dryrun" || test -z "$TORTURE_INITRD" || tools/testing/selftests/rcutorture/bin/mkinitrd.sh
then
:
else
@@ -283,19 +286,34 @@ then
exit 1
fi
fi
-for CF1 in $configs_derep
+echo 'BEGIN {' > $T/cfgcpu.awk
+for CF1 in `echo $configs_derep | tr -s ' ' '\012' | sort -u`
do
if test -f "$CONFIGFRAG/$CF1"
then
- cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF1`
+ if echo "$TORTURE_KCONFIG_ARG" | grep -q '\<CONFIG_NR_CPUS='
+ then
+ echo "$TORTURE_KCONFIG_ARG" | tr -s ' ' | tr ' ' '\012' > $T/KCONFIG_ARG
+ cpu_count=`configNR_CPUS.sh $T/KCONFIG_ARG`
+ else
+ cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF1`
+ fi
cpu_count=`configfrag_boot_cpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"`
cpu_count=`configfrag_boot_maxcpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"`
- echo $CF1 $cpu_count >> $T/cfgcpu
+ echo 'scenariocpu["'"$CF1"'"] = '"$cpu_count"';' >> $T/cfgcpu.awk
else
echo "The --configs file $CF1 does not exist, terminating."
exit 1
fi
done
+cat << '___EOF___' >> $T/cfgcpu.awk
+}
+{
+ for (i = 1; i <= NF; i++)
+ print $i, scenariocpu[$i];
+}
+___EOF___
+echo $configs_derep | awk -f $T/cfgcpu.awk > $T/cfgcpu
sort -k2nr $T/cfgcpu -T="$T" > $T/cfgcpu.sort
# Use a greedy bin-packing algorithm, sorting the list accordingly.
@@ -315,11 +333,10 @@ END {
batch = 0;
nc = -1;
- # Each pass through the following loop creates on test batch
- # that can be executed concurrently given ncpus. Note that a
- # given test that requires more than the available CPUs will run in
- # their own batch. Such tests just have to make do with what
- # is available.
+ # Each pass through the following loop creates one test batch that
+ # can be executed concurrently given ncpus. Note that a given test
+ # that requires more than the available CPUs will run in its own
+ # batch. Such tests just have to make do with what is available.
while (nc != ncpus) {
batch++;
nc = ncpus;
@@ -375,9 +392,9 @@ if ! test -e $resdir
then
mkdir -p "$resdir" || :
fi
-mkdir $resdir/$ds
+mkdir -p $resdir/$ds
TORTURE_RESDIR="$resdir/$ds"; export TORTURE_RESDIR
-TORTURE_STOPFILE="$resdir/$ds/STOP"; export TORTURE_STOPFILE
+TORTURE_STOPFILE="$resdir/$ds/STOP.1"; export TORTURE_STOPFILE
echo Results directory: $resdir/$ds
echo $scriptname $args
touch $resdir/$ds/log
@@ -517,14 +534,19 @@ END {
dump(first, i, batchnum);
}' >> $T/script
+cat << '___EOF___' >> $T/script
+echo | tee -a $TORTURE_RESDIR/log
+echo | tee -a $TORTURE_RESDIR/log
+echo " --- `date` Test summary:" | tee -a $TORTURE_RESDIR/log
+___EOF___
cat << ___EOF___ >> $T/script
-echo
-echo
-echo " --- `date` Test summary:"
-echo Results directory: $resdir/$ds
-kcsan-collapse.sh $resdir/$ds
-kvm-recheck.sh $resdir/$ds
+echo Results directory: $resdir/$ds | tee -a $resdir/$ds/log
+kcsan-collapse.sh $resdir/$ds | tee -a $resdir/$ds/log
+kvm-recheck.sh $resdir/$ds > $T/kvm-recheck.sh.out 2>&1
___EOF___
+echo 'ret=$?' >> $T/script
+echo "cat $T/kvm-recheck.sh.out | tee -a $resdir/$ds/log" >> $T/script
+echo 'exit $ret' >> $T/script
if test "$dryrun" = script
then
@@ -533,13 +555,34 @@ then
elif test "$dryrun" = sched
then
# Extract the test run schedule from the script.
- egrep 'Start batch|Starting build\.' $T/script |
- grep -v ">>" |
+ egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" |
sed -e 's/:.*$//' -e 's/^echo //'
+ nbuilds="`grep 'Starting build\.' $T/script |
+ grep -v ">>" | sed -e 's/:.*$//' -e 's/^echo //' |
+ awk '{ print $1 }' | grep -v '\.' | wc -l`"
+ echo Total number of builds: $nbuilds
+ nbatches="`grep 'Start batch' $T/script | grep -v ">>" | wc -l`"
+ echo Total number of batches: $nbatches
exit 0
+elif test "$dryrun" = batches
+then
+ # Extract the tests and their batches from the script.
+ egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" |
+ sed -e 's/:.*$//' -e 's/^echo //' -e 's/-ovf//' |
+ awk '
+ /^----Start/ {
+ batchno = $3;
+ next;
+ }
+ {
+ print batchno, $1, $2
+ }'
else
# Not a dryrun, so run the script.
- sh $T/script
+ bash $T/script
+ ret=$?
+ echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a $resdir/$ds/log
+ exit $ret
fi
# Tracing: trace_event=rcu:rcu_grace_period,rcu:rcu_future_grace_period,rcu:rcu_grace_period_init,rcu:rcu_nocb_wake,rcu:rcu_preempt_task,rcu:rcu_unlock_preempted_task,rcu:rcu_quiescent_state_report,rcu:rcu_fqs,rcu:rcu_callback,rcu:rcu_kfree_callback,rcu:rcu_batch_start,rcu:rcu_invoke_callback,rcu:rcu_invoke_kfree_callback,rcu:rcu_batch_end,rcu:rcu_torture_read,rcu:rcu_barrier
diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
index 38e424d2392cc..70d62fd0d31d4 100755
--- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
+++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
@@ -70,7 +70,7 @@ if echo -e "#if __x86_64__||__i386__||__i486__||__i586__||__i686__" \
# architecture supported by nolibc
${CROSS_COMPILE}gcc -fno-asynchronous-unwind-tables -fno-ident \
-nostdlib -include ../../../../include/nolibc/nolibc.h \
- -lgcc -s -static -Os -o init init.c
+ -s -static -Os -o init init.c -lgcc
else
${CROSS_COMPILE}gcc -s -static -Os -o init init.c
fi
diff --git a/tools/testing/selftests/rcutorture/bin/parse-build.sh b/tools/testing/selftests/rcutorture/bin/parse-build.sh
index 09155c15ea651..9313e5065ae92 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-build.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-build.sh
@@ -21,7 +21,7 @@ mkdir $T
. functions.sh
-if grep -q CC < $F || test -n "$TORTURE_TRUST_MAKE"
+if grep -q CC < $F || test -n "$TORTURE_TRUST_MAKE" || grep -qe --trust-make < `dirname $F`/../log
then
:
else
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 263b1be500080..9f624bd53c277 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -128,7 +128,7 @@ then
then
summary="$summary Badness: $n_badness"
fi
- n_warn=`grep -v 'Warning: unable to open an initial console' $file | egrep -c 'WARNING:|Warn'`
+ n_warn=`grep -v 'Warning: unable to open an initial console' $file | grep -v 'Warning: Failed to add ttynull console. No stdin, stdout, and stderr for the init process' | egrep -c 'WARNING:|Warn'`
if test "$n_warn" -ne 0
then
summary="$summary Warnings: $n_warn"
diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh
new file mode 100755
index 0000000000000..ad7525b7ac297
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/torture.sh
@@ -0,0 +1,442 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Run a series of torture tests, intended for overnight or
+# longer timeframes, and also for large systems.
+#
+# Usage: torture.sh [ options ]
+#
+# Copyright (C) 2020 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+scriptname=$0
+args="$*"
+
+KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
+PATH=${KVM}/bin:$PATH; export PATH
+. functions.sh
+
+TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`"
+MAKE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS*2))
+HALF_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2))
+if test "$HALF_ALLOTED_CPUS" -lt 1
+then
+ HALF_ALLOTED_CPUS=1
+fi
+VERBOSE_BATCH_CPUS=$((TORTURE_ALLOTED_CPUS/16))
+if test "$VERBOSE_BATCH_CPUS" -lt 2
+then
+ VERBOSE_BATCH_CPUS=0
+fi
+
+# Configurations/scenarios.
+configs_rcutorture=
+configs_locktorture=
+configs_scftorture=
+kcsan_kmake_args=
+
+# Default compression, duration, and apportionment.
+compress_kasan_vmlinux="`identify_qemu_vcpus`"
+duration_base=10
+duration_rcutorture_frac=7
+duration_locktorture_frac=1
+duration_scftorture_frac=2
+
+# "yes" or "no" parameters
+do_allmodconfig=yes
+do_rcutorture=yes
+do_locktorture=yes
+do_scftorture=yes
+do_rcuscale=yes
+do_refscale=yes
+do_kvfree=yes
+do_kasan=yes
+do_kcsan=no
+
+# doyesno - Print "yes" if the two arguments are the same string, else "no"
+function doyesno () {
+	case "$1" in
+	"$2")	echo yes ;;
+	*)	echo no ;;
+	esac
+}
+
+# usage - Print the list of accepted options and exit with status 1.
+# The option spellings must match the "case" patterns in the parsing loop.
+usage () {
+	echo "Usage: $scriptname optional arguments:"
+	echo " --compress-kasan-vmlinux concurrency"
+	echo " --configs-rcutorture \"config-file list w/ repeat factor (3*TINY01)\""
+	echo " --configs-locktorture \"config-file list w/ repeat factor (10*LOCK01)\""
+	echo " --configs-scftorture \"config-file list w/ repeat factor (2*CFLIST)\""
+	echo " --doall"
+	echo " --do-allmodconfig / --do-no-allmodconfig"
+	echo " --do-kasan / --do-no-kasan"
+	echo " --do-kcsan / --do-no-kcsan"
+	echo " --do-kvfree / --do-no-kvfree"
+	echo " --do-locktorture / --do-no-locktorture"
+	echo " --do-none"
+	echo " --do-rcuscale / --do-no-rcuscale"
+	echo " --do-rcutorture / --do-no-rcutorture"
+	echo " --do-refscale / --do-no-refscale"
+	echo " --do-scftorture / --do-no-scftorture"
+	echo " --duration [ <minutes> | <hours>h | <days>d ]"
+	echo " --kcsan-kmake-arg kernel-make-arguments"
+	exit 1
+}
+
+# Parse the command line.  An option that takes a value consumes it with
+# the "shift" inside its branch; the "shift" after "esac" then drops the
+# option itself.  checkarg and usage exit the script on invalid input.
+while test $# -gt 0
+do
+	case "$1" in
+	--compress-kasan-vmlinux)
+		checkarg --compress-kasan-vmlinux "(concurrency level)" $# "$2" '^[0-9][0-9]*$' '^error'
+		compress_kasan_vmlinux=$2
+		shift
+		;;
+	--config-rcutorture|--configs-rcutorture)
+		checkarg --configs-rcutorture "(list of config files)" "$#" "$2" '^[^/]\+$' '^--'
+		configs_rcutorture="$configs_rcutorture $2"
+		shift
+		;;
+	--config-locktorture|--configs-locktorture)
+		checkarg --configs-locktorture "(list of config files)" "$#" "$2" '^[^/]\+$' '^--'
+		configs_locktorture="$configs_locktorture $2"
+		shift
+		;;
+	--config-scftorture|--configs-scftorture)
+		checkarg --configs-scftorture "(list of config files)" "$#" "$2" '^[^/]\+$' '^--'
+		configs_scftorture="$configs_scftorture $2"
+		shift
+		;;
+	--do-all|--doall)
+		# --do-all matches the other --do-* options; --doall is kept
+		# for compatibility.
+		do_allmodconfig=yes
+		do_rcutorture=yes
+		do_locktorture=yes
+		do_scftorture=yes
+		do_rcuscale=yes
+		do_refscale=yes
+		do_kvfree=yes
+		do_kasan=yes
+		do_kcsan=yes
+		;;
+	--do-allmodconfig|--do-no-allmodconfig)
+		do_allmodconfig=`doyesno "$1" --do-allmodconfig`
+		;;
+	--do-kasan|--do-no-kasan)
+		do_kasan=`doyesno "$1" --do-kasan`
+		;;
+	--do-kcsan|--do-no-kcsan)
+		do_kcsan=`doyesno "$1" --do-kcsan`
+		;;
+	--do-kvfree|--do-no-kvfree)
+		do_kvfree=`doyesno "$1" --do-kvfree`
+		;;
+	--do-locktorture|--do-no-locktorture)
+		do_locktorture=`doyesno "$1" --do-locktorture`
+		;;
+	--do-none)
+		do_allmodconfig=no
+		do_rcutorture=no
+		do_locktorture=no
+		do_scftorture=no
+		do_rcuscale=no
+		do_refscale=no
+		do_kvfree=no
+		do_kasan=no
+		do_kcsan=no
+		;;
+	--do-rcuscale|--do-no-rcuscale)
+		do_rcuscale=`doyesno "$1" --do-rcuscale`
+		;;
+	--do-rcutorture|--do-no-rcutorture)
+		do_rcutorture=`doyesno "$1" --do-rcutorture`
+		;;
+	--do-refscale|--do-no-refscale)
+		do_refscale=`doyesno "$1" --do-refscale`
+		;;
+	--do-scftorture|--do-no-scftorture)
+		do_scftorture=`doyesno "$1" --do-scftorture`
+		;;
+	--duration)
+		checkarg --duration "(minutes)" $# "$2" '^[0-9][0-9]*\(m\|h\|d\|\)$' '^error'
+		# Convert to minutes: no suffix or "m" means minutes,
+		# "h" means hours, and "d" means days.
+		case "$2" in
+		*h)	mult=60 ;;
+		*d)	mult=1440 ;;
+		*)	mult=1 ;;
+		esac
+		ts=${2%[mhd]}
+		# Force base 10 so that a leading zero ("08") is not taken
+		# as the start of an octal constant.
+		duration_base=$((10#$ts*mult))
+		shift
+		;;
+	--kcsan-kmake-arg|--kcsan-kmake-args)
+		checkarg --kcsan-kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'
+		# Accumulate, trimming leading and trailing spaces.
+		kcsan_kmake_args="`echo "$kcsan_kmake_args $2" | sed -e 's/^ *//' -e 's/ *$//'`"
+		shift
+		;;
+	*)
+		echo Unknown argument $1
+		usage
+		;;
+	esac
+	shift
+done
+
+ds="`date +%Y.%m.%d-%H.%M.%S`-torture"
+startdate="`date`"
+starttime="`get_starttime`"
+
+T=/tmp/torture.sh.$$
+trap 'rm -rf $T' 0 2
+mkdir $T
+
+echo " --- " $scriptname $args | tee -a $T/log
+echo " --- Results directory: " $ds | tee -a $T/log
+
+# Calculate rcutorture defaults and apportion time
+if test -z "$configs_rcutorture"
+then
+ configs_rcutorture=CFLIST
+fi
+duration_rcutorture=$((duration_base*duration_rcutorture_frac/10))
+if test "$duration_rcutorture" -eq 0
+then
+ echo " --- Zero time for rcutorture, disabling" | tee -a $T/log
+ do_rcutorture=no
+fi
+
+# Calculate locktorture defaults and apportion time
+if test -z "$configs_locktorture"
+then
+ configs_locktorture=CFLIST
+fi
+duration_locktorture=$((duration_base*duration_locktorture_frac/10))
+if test "$duration_locktorture" -eq 0
+then
+ echo " --- Zero time for locktorture, disabling" | tee -a $T/log
+ do_locktorture=no
+fi
+
+# Calculate scftorture defaults and apportion time
+if test -z "$configs_scftorture"
+then
+ configs_scftorture=CFLIST
+fi
+duration_scftorture=$((duration_base*duration_scftorture_frac/10))
+if test "$duration_scftorture" -eq 0
+then
+ echo " --- Zero time for scftorture, disabling" | tee -a $T/log
+ do_scftorture=no
+fi
+
+touch $T/failures
+touch $T/successes
+
+# torture_one - Does a single kvm.sh run.
+#
+# Usage:
+# torture_bootargs="[ kernel boot arguments ]"
+# torture_one flavor [ kvm.sh arguments ]
+#
+# Note that "flavor" is an arbitrary string. Supply --torture if needed.
+# Note that quoting is problematic. So on the command line, pass multiple
+# values with multiple kvm.sh argument instances.
+function torture_one {
+ local cur_bootargs=
+ local boottag=
+
+ echo " --- $curflavor:" Start `date` | tee -a $T/log
+ if test -n "$torture_bootargs"
+ then
+ boottag="--bootargs"
+ cur_bootargs="$torture_bootargs"
+ fi
+ "$@" $boottag "$cur_bootargs" --datestamp "$ds/results-$curflavor" > $T/$curflavor.out 2>&1
+ retcode=$?
+ resdir="`grep '^Results directory: ' $T/$curflavor.out | tail -1 | sed -e 's/^Results directory: //'`"
+ if test -z "$resdir"
+ then
+ cat $T/$curflavor.out | tee -a $T/log
+ echo retcode=$retcode | tee -a $T/log
+ fi
+ if test "$retcode" == 0
+ then
+ echo "$curflavor($retcode)" $resdir >> $T/successes
+ else
+ echo "$curflavor($retcode)" $resdir >> $T/failures
+ fi
+}
+
+# torture_set - Does a set of tortures with and without KASAN and KCSAN.
+#
+# Usage:
+# torture_bootargs="[ kernel boot arguments ]"
+# torture_set flavor [ kvm.sh arguments ]
+#
+# Note that "flavor" is an arbitrary string. Supply --torture if needed.
+# Note that quoting is problematic. So on the command line, pass multiple
+# values with multiple kvm.sh argument instances.
+function torture_set {
+ local cur_kcsan_kmake_args=
+ local kcsan_kmake_tag=
+ local flavor=$1
+ shift
+ curflavor=$flavor
+ torture_one "$@"
+ if test "$do_kasan" = "yes"
+ then
+ curflavor=${flavor}-kasan
+ torture_one "$@" --kasan
+ fi
+ if test "$do_kcsan" = "yes"
+ then
+ curflavor=${flavor}-kcsan
+ if test -n "$kcsan_kmake_args"
+ then
+ kcsan_kmake_tag="--kmake-args"
+ cur_kcsan_kmake_args="$kcsan_kmake_args"
+ fi
+ torture_one $* --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan
+ fi
+}
+
+# make allmodconfig
+if test "$do_allmodconfig" = "yes"
+then
+ echo " --- allmodconfig:" Start `date` | tee -a $T/log
+ amcdir="tools/testing/selftests/rcutorture/res/$ds/allmodconfig"
+ mkdir -p "$amcdir"
+ echo " --- make clean" > "$amcdir/Make.out" 2>&1
+ make -j$MAKE_ALLOTED_CPUS clean >> "$amcdir/Make.out" 2>&1
+ echo " --- make allmodconfig" >> "$amcdir/Make.out" 2>&1
+ make -j$MAKE_ALLOTED_CPUS allmodconfig >> "$amcdir/Make.out" 2>&1
+ echo " --- make " >> "$amcdir/Make.out" 2>&1
+ make -j$MAKE_ALLOTED_CPUS >> "$amcdir/Make.out" 2>&1
+ retcode="$?"
+ echo $retcode > "$amcdir/Make.exitcode"
+ if test "$retcode" == 0
+ then
+ echo "allmodconfig($retcode)" $amcdir >> $T/successes
+ else
+ echo "allmodconfig($retcode)" $amcdir >> $T/failures
+ fi
+fi
+
+# --torture rcu
+if test "$do_rcutorture" = "yes"
+then
+ torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000"
+ torture_set "rcutorture" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration "$duration_rcutorture" --configs "$configs_rcutorture" --trust-make
+fi
+
+if test "$do_locktorture" = "yes"
+then
+ torture_bootargs="torture.disable_onoff_at_boot"
+ torture_set "locktorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture lock --allcpus --duration "$duration_locktorture" --configs "$configs_locktorture" --trust-make
+fi
+
+if test "$do_scftorture" = "yes"
+then
+ torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot"
+ torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make
+fi
+
+# refscale: one five-minute run per primitive, where the primitives are
+# the quoted strings on the ".name =" lines of kernel/rcu/refscale.c.
+if test "$do_refscale" = yes
+then
+	primlist="`grep '\.name[ ]*=' kernel/rcu/refscale.c | sed -e 's/^[^"]*"//' -e 's/".*$//'`"
+else
+	primlist=
+fi
+for prim in $primlist
+do
+	# $prim is kept inside the double quotes so that it is never
+	# subject to word splitting or pathname expansion.
+	torture_bootargs="refscale.scale_type=$prim refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot"
+	torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make
+done
+
+# rcuscale: likewise, with the primitives taken from kernel/rcu/rcuscale.c.
+if test "$do_rcuscale" = yes
+then
+	primlist="`grep '\.name[ ]*=' kernel/rcu/rcuscale.c | sed -e 's/^[^"]*"//' -e 's/".*$//'`"
+else
+	primlist=
+fi
+for prim in $primlist
+do
+	torture_bootargs="rcuscale.scale_type=$prim rcuscale.nwriters=$HALF_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot"
+	torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make
+done
+
+# kvfree: a ten-minute rcuscale run with rcuscale.kfree_rcu_test=1.
+if test "$do_kvfree" = "yes"
+then
+	torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot"
+	torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make
+fi
+
+echo " --- " $scriptname $args
+echo " --- " Done `date` | tee -a $T/log
+ret=0
+nsuccesses=0
+echo SUCCESSES: | tee -a $T/log
+if test -s "$T/successes"
+then
+ cat "$T/successes" | tee -a $T/log
+ nsuccesses="`wc -l "$T/successes" | awk '{ print $1 }'`"
+fi
+nfailures=0
+echo FAILURES: | tee -a $T/log
+if test -s "$T/failures"
+then
+ cat "$T/failures" | tee -a $T/log
+ nfailures="`wc -l "$T/failures" | awk '{ print $1 }'`"
+ ret=2
+fi
+echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log
+echo Summary: Successes: $nsuccesses Failures: $nfailures. | tee -a $T/log
+# Derive the top-level results directory by stripping the last path
+# component from the directory recorded for a run.  Runs that reported no
+# results directory have only the "flavor(retcode)" field, so take the
+# first record that has a second field.
+tdir="`cat $T/successes $T/failures | awk 'NF > 1 { print $NF; exit }' | sed -e 's,/[^/]\+/*$,,'`"
+if test -n "$tdir" && test $compress_kasan_vmlinux -gt 0
+then
+	# KASAN vmlinux files can approach 1GB in size, so compress them.
+	echo Looking for KASAN files to compress: `date` > "$tdir/log-xz" 2>&1
+	find "$tdir" -type d -name '*-kasan' -print > $T/xz-todo
+	ncompresses=0
+	batchno=1
+	if test -s $T/xz-todo
+	then
+		echo Size before compressing: `du -sh "$tdir" | awk '{ print $1 }'` `date` 2>&1 | tee -a "$tdir/log-xz" | tee -a $T/log
+		for i in `cat $T/xz-todo`
+		do
+			echo Compressing vmlinux files in ${i}: `date` >> "$tdir/log-xz" 2>&1
+			for j in $i/*/vmlinux
+			do
+				# An unmatched glob stays a literal pattern; skip it.
+				test -f "$j" || continue
+				# Run xz in the background, at most
+				# $compress_kasan_vmlinux at a time.
+				xz "$j" >> "$tdir/log-xz" 2>&1 &
+				ncompresses=$((ncompresses+1))
+				if test $ncompresses -ge $compress_kasan_vmlinux
+				then
+					echo Waiting for batch $batchno of $ncompresses compressions `date` | tee -a "$tdir/log-xz" | tee -a $T/log
+					wait
+					ncompresses=0
+					batchno=$((batchno+1))
+				fi
+			done
+		done
+		if test $ncompresses -gt 0
+		then
+			echo Waiting for final batch $batchno of $ncompresses compressions `date` | tee -a "$tdir/log-xz" | tee -a $T/log
+		fi
+		wait
+		echo Size after compressing: `du -sh "$tdir" | awk '{ print $1 }'` `date` 2>&1 | tee -a "$tdir/log-xz" | tee -a $T/log
+		echo Total duration `get_starttime_duration $starttime`. | tee -a $T/log
+	else
+		echo No compression needed: `date` >> "$tdir/log-xz" 2>&1
+	fi
+fi
+# Keep a copy of the log with the results, since $T is removed on exit.
+if test -n "$tdir"
+then
+	cp $T/log "$tdir"
+fi
+exit $ret
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot
index 9363708c9075c..932a0799eb084 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot
@@ -1 +1,2 @@
rcutorture.torture_type=tasks-rude
+rcutree.use_softirq=0
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot
index cd2a188eeb6d9..22cdeced98ea8 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot
@@ -1 +1,2 @@
rcutorture.torture_type=tasks
+rcutree.use_softirq=0
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
index d6da9a61d44af..40af3df0f397f 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
@@ -2,5 +2,7 @@ maxcpus=8 nr_cpus=43
rcutree.gp_preinit_delay=3
rcutree.gp_init_delay=3
rcutree.gp_cleanup_delay=3
-rcu_nocbs=0
+rcu_nocbs=0-1,3-7
+rcutorture.nocbs_nthreads=8
+rcutorture.nocbs_toggle=1000
rcutorture.fwd_progress=0
diff --git a/tools/testing/selftests/x86/helpers.h b/tools/testing/selftests/x86/helpers.h
index f5ff2a2615df0..4ef42c4559a9e 100644
--- a/tools/testing/selftests/x86/helpers.h
+++ b/tools/testing/selftests/x86/helpers.h
@@ -6,36 +6,20 @@
static inline unsigned long get_eflags(void)
{
- unsigned long eflags;
-
- asm volatile (
#ifdef __x86_64__
- "subq $128, %%rsp\n\t"
- "pushfq\n\t"
- "popq %0\n\t"
- "addq $128, %%rsp"
+ return __builtin_ia32_readeflags_u64();
#else
- "pushfl\n\t"
- "popl %0"
+ return __builtin_ia32_readeflags_u32();
#endif
- : "=r" (eflags) :: "memory");
-
- return eflags;
}
static inline void set_eflags(unsigned long eflags)
{
- asm volatile (
#ifdef __x86_64__
- "subq $128, %%rsp\n\t"
- "pushq %0\n\t"
- "popfq\n\t"
- "addq $128, %%rsp"
+ __builtin_ia32_writeeflags_u64(eflags);
#else
- "pushl %0\n\t"
- "popfl"
+ __builtin_ia32_writeeflags_u32(eflags);
#endif
- :: "r" (eflags) : "flags", "memory");
}
#endif /* __SELFTESTS_X86_HELPERS_H */