aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2022-02-11 18:05:29 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2022-02-11 18:05:31 +1100
commita00df6c1c55a7e8cf10d987488868c4976f46201 (patch)
tree9eabac22484f4ea90e82aa7a6b705123680f0b93
parentfeb823bca03528cf31cde7771c674e5b0b58854d (diff)
parent823635261f43ee0a99f44a1fec22cb46097b6ee9 (diff)
downloadlinux-next-a00df6c1c55a7e8cf10d987488868c4976f46201.tar.gz
Merge branch 'akpm-current/current'
Notice: this object is not reachable from any branch.
Notice: this object is not reachable from any branch.
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst5
-rw-r--r--Documentation/admin-guide/kdump/kdump.rst10
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt3
-rw-r--r--Documentation/admin-guide/mm/damon/usage.rst24
-rw-r--r--Documentation/admin-guide/mm/zswap.rst22
-rw-r--r--Documentation/admin-guide/sysctl/kernel.rst31
-rw-r--r--Documentation/dev-tools/kasan.rst17
-rw-r--r--Documentation/vm/balance.rst2
-rw-r--r--Documentation/vm/page_owner.rst29
-rw-r--r--arch/arm/Kconfig4
-rw-r--r--arch/arm64/Kconfig6
-rw-r--r--arch/arm64/include/asm/vmalloc.h6
-rw-r--r--arch/arm64/include/asm/vmap_stack.h5
-rw-r--r--arch/arm64/kernel/module.c5
-rw-r--r--arch/arm64/kernel/setup.c3
-rw-r--r--arch/arm64/mm/hugetlbpage.c1
-rw-r--r--arch/arm64/mm/init.c9
-rw-r--r--arch/arm64/mm/pageattr.c2
-rw-r--r--arch/arm64/net/bpf_jit_comp.c3
-rw-r--r--arch/hexagon/mm/init.c2
-rw-r--r--arch/ia64/kernel/topology.c10
-rw-r--r--arch/ia64/mm/discontig.c11
-rw-r--r--arch/mips/kernel/topology.c5
-rw-r--r--arch/nds32/mm/init.c1
-rw-r--r--arch/openrisc/mm/init.c2
-rw-r--r--arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h4
-rw-r--r--arch/powerpc/kernel/fadump.c6
-rw-r--r--arch/powerpc/kernel/sysfs.c17
-rw-r--r--arch/powerpc/mm/book3s64/trace.c1
-rw-r--r--arch/riscv/Kconfig4
-rw-r--r--arch/riscv/kernel/setup.c3
-rw-r--r--arch/riscv/mm/init.c6
-rw-r--r--arch/s390/kernel/module.c2
-rw-r--r--arch/s390/kernel/numa.c7
-rw-r--r--arch/sh/kernel/topology.c5
-rw-r--r--arch/sparc/kernel/sysfs.c12
-rw-r--r--arch/sparc/mm/hugetlbpage.c1
-rw-r--r--arch/x86/Kconfig7
-rw-r--r--arch/x86/kernel/module.c2
-rw-r--r--arch/x86/kernel/setup.c10
-rw-r--r--arch/x86/kernel/topology.c5
-rw-r--r--arch/x86/mm/init.c1
-rw-r--r--block/bfq-iosched.c2
-rw-r--r--drivers/base/init.c1
-rw-r--r--drivers/base/memory.c8
-rw-r--r--drivers/base/node.c30
-rw-r--r--drivers/block/drbd/drbd_int.h3
-rw-r--r--drivers/block/drbd/drbd_req.c3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_migrate.c34
-rw-r--r--drivers/iommu/tegra-smmu.c4
-rw-r--r--fs/binfmt_elf.c65
-rw-r--r--fs/buffer.c56
-rw-r--r--fs/ceph/addr.c27
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/coredump.c39
-rw-r--r--fs/exec.c26
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/f2fs/compress.c6
-rw-r--r--fs/f2fs/data.c9
-rw-r--r--fs/f2fs/segment.c14
-rw-r--r--fs/f2fs/super.c8
-rw-r--r--fs/fs-writeback.c37
-rw-r--r--fs/fuse/control.c17
-rw-r--r--fs/fuse/dev.c8
-rw-r--r--fs/minix/inode.c3
-rw-r--r--fs/namei.c8
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/nfs/sysctl.c7
-rw-r--r--fs/nfs/write.c53
-rw-r--r--fs/nilfs2/segbuf.c12
-rw-r--r--fs/ntfs/inode.c4
-rw-r--r--fs/ocfs2/file.c9
-rw-r--r--fs/ocfs2/namei.c49
-rw-r--r--fs/ocfs2/namei.h2
-rw-r--r--fs/ocfs2/refcounttree.c15
-rw-r--r--fs/ocfs2/stack_user.c18
-rw-r--r--fs/ocfs2/xattr.c12
-rw-r--r--fs/ocfs2/xattr.h1
-rw-r--r--fs/pipe.c13
-rw-r--r--fs/proc/base.c8
-rw-r--r--fs/proc/fd.c23
-rw-r--r--fs/proc/page.c40
-rw-r--r--fs/proc/task_mmu.c40
-rw-r--r--fs/proc/vmcore.c43
-rw-r--r--fs/xfs/xfs_buf.c3
-rw-r--r--include/linux/backing-dev-defs.h8
-rw-r--r--include/linux/backing-dev.h50
-rw-r--r--include/linux/balloon_compaction.h22
-rw-r--r--include/linux/bitfield.h3
-rw-r--r--include/linux/cma.h2
-rw-r--r--include/linux/compiler_types.h2
-rw-r--r--include/linux/damon.h13
-rw-r--r--include/linux/fs.h1
-rw-r--r--include/linux/gfp.h47
-rw-r--r--include/linux/highmem-internal.h10
-rw-r--r--include/linux/hugetlb.h8
-rw-r--r--include/linux/kallsyms.h27
-rw-r--r--include/linux/kasan.h97
-rw-r--r--include/linux/kexec.h12
-rw-r--r--include/linux/kfence.h2
-rw-r--r--include/linux/log2.h4
-rw-r--r--include/linux/memcontrol.h10
-rw-r--r--include/linux/memory_hotplug.h118
-rw-r--r--include/linux/mempolicy.h5
-rw-r--r--include/linux/memremap.h14
-rw-r--r--include/linux/migrate.h1
-rw-r--r--include/linux/mm.h29
-rw-r--r--include/linux/mm_inline.h11
-rw-r--r--include/linux/mm_types.h19
-rw-r--r--include/linux/mmzone.h25
-rw-r--r--include/linux/nfs_fs.h1
-rw-r--r--include/linux/nfs_fs_sb.h1
-rw-r--r--include/linux/node.h9
-rw-r--r--include/linux/page-flags.h90
-rw-r--r--include/linux/pagemap.h7
-rw-r--r--include/linux/rmap.h31
-rw-r--r--include/linux/sched.h8
-rw-r--r--include/linux/sched/sysctl.h10
-rw-r--r--include/linux/stddef.h6
-rw-r--r--include/linux/swap.h4
-rw-r--r--include/linux/thread_info.h5
-rw-r--r--include/linux/uaccess.h2
-rw-r--r--include/linux/vm_event_item.h3
-rw-r--r--include/linux/vmalloc.h18
-rw-r--r--include/linux/writeback.h15
-rw-r--r--include/trace/events/huge_memory.h1
-rw-r--r--include/trace/events/migrate.h31
-rw-r--r--include/trace/events/mmflags.h15
-rw-r--r--include/trace/events/thp.h27
-rw-r--r--include/trace/events/writeback.h28
-rw-r--r--init/main.c25
-rw-r--r--ipc/mqueue.c14
-rw-r--r--kernel/events/uprobes.c7
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/hung_task.c44
-rw-r--r--kernel/kcov.c98
-rw-r--r--kernel/panic.c24
-rw-r--r--kernel/sched/core.c21
-rw-r--r--kernel/scs.c4
-rw-r--r--kernel/sysctl.c71
-rw-r--r--lib/Kconfig.debug145
-rw-r--r--lib/Kconfig.kasan20
-rw-r--r--lib/Kconfig.kcsan11
-rw-r--r--lib/Kconfig.ubsan12
-rw-r--r--lib/lz4/lz4_decompress.c8
-rw-r--r--lib/test_hmm.c356
-rw-r--r--lib/test_hmm_uapi.h22
-rw-r--r--lib/test_kasan.c196
-rw-r--r--lib/test_printf.c8
-rw-r--r--lib/ubsan.c10
-rw-r--r--lib/vsprintf.c13
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/backing-dev.c57
-rw-r--r--mm/balloon_compaction.c6
-rw-r--r--mm/cma.c11
-rw-r--r--mm/cma.h1
-rw-r--r--mm/damon/core-test.h21
-rw-r--r--mm/damon/core.c36
-rw-r--r--mm/damon/dbgfs-test.h83
-rw-r--r--mm/damon/dbgfs.c202
-rw-r--r--mm/damon/reclaim.c3
-rw-r--r--mm/damon/vaddr-test.h6
-rw-r--r--mm/damon/vaddr.c10
-rw-r--r--mm/debug.c1
-rw-r--r--mm/fadvise.c5
-rw-r--r--mm/filemap.c11
-rw-r--r--mm/gup.c255
-rw-r--r--mm/highmem.c9
-rw-r--r--mm/hmm.c3
-rw-r--r--mm/huge_memory.c186
-rw-r--r--mm/hugetlb.c10
-rw-r--r--mm/hugetlb_vmemmap.c68
-rw-r--r--mm/internal.h81
-rw-r--r--mm/kasan/common.c4
-rw-r--r--mm/kasan/hw_tags.c193
-rw-r--r--mm/kasan/kasan.h18
-rw-r--r--mm/kasan/report.c10
-rw-r--r--mm/kasan/shadow.c63
-rw-r--r--mm/kfence/core.c3
-rw-r--r--mm/kfence/kfence_test.c8
-rw-r--r--mm/khugepaged.c25
-rw-r--r--mm/ksm.c15
-rw-r--r--mm/maccess.c6
-rw-r--r--mm/madvise.c5
-rw-r--r--mm/memcontrol.c68
-rw-r--r--mm/memory-failure.c18
-rw-r--r--mm/memory.c168
-rw-r--r--mm/memory_hotplug.c76
-rw-r--r--mm/mempolicy.c33
-rw-r--r--mm/memremap.c37
-rw-r--r--mm/migrate.c184
-rw-r--r--mm/mlock.c634
-rw-r--r--mm/mmap.c32
-rw-r--r--mm/mmzone.c14
-rw-r--r--mm/mprotect.c13
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page-writeback.c103
-rw-r--r--mm/page_alloc.c316
-rw-r--r--mm/page_owner.c70
-rw-r--r--mm/percpu-vm.c8
-rw-r--r--mm/ptdump.c16
-rw-r--r--mm/readahead.c6
-rw-r--r--mm/rmap.c210
-rw-r--r--mm/sparse-vmemmap.c70
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c89
-rw-r--r--mm/swapfile.c104
-rw-r--r--mm/usercopy.c11
-rw-r--r--mm/userfaultfd.c14
-rw-r--r--mm/vmalloc.c203
-rw-r--r--mm/vmscan.c50
-rw-r--r--mm/vmstat.c6
-rw-r--r--mm/zswap.c15
-rwxr-xr-xscripts/checkpatch.pl36
-rw-r--r--tools/include/linux/gfp.h3
-rw-r--r--tools/perf/builtin-kmem.c1
-rw-r--r--tools/testing/selftests/Makefile32
-rw-r--r--tools/testing/selftests/exec/Makefile1
-rw-r--r--tools/testing/selftests/exec/null-argv.c78
-rw-r--r--tools/testing/selftests/futex/functional/Makefile5
-rw-r--r--tools/testing/selftests/kselftest.h10
-rw-r--r--tools/testing/selftests/kvm/Makefile2
-rw-r--r--tools/testing/selftests/landlock/Makefile2
-rw-r--r--tools/testing/selftests/net/Makefile2
-rw-r--r--tools/testing/selftests/net/mptcp/Makefile2
-rw-r--r--tools/testing/selftests/vm/.gitignore1
-rw-r--r--tools/testing/selftests/vm/Makefile5
-rw-r--r--tools/testing/selftests/vm/hmm-tests.c204
-rw-r--r--tools/testing/selftests/vm/hugepage-mremap.c26
-rw-r--r--tools/testing/selftests/vm/hugepage-vmemmap.c144
-rw-r--r--tools/testing/selftests/vm/memfd_secret.c2
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests.sh14
-rwxr-xr-xtools/testing/selftests/vm/test_hmm.sh24
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c3
-rw-r--r--tools/vm/page_owner_sort.c216
236 files changed, 4571 insertions, 3279 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 5aa368d165dab..69d7a6983f781 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1301,6 +1301,11 @@ PAGE_SIZE multiple when read back.
Amount of memory used to cache filesystem data,
including tmpfs and shared memory.
+ kernel (npn)
+ Amount of total kernel memory, including
+ (kernel_stack, pagetables, percpu, vmalloc, slab) in
+ addition to other kernel memory use cases.
+
kernel_stack
Amount of memory allocated to kernel stacks.
diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
index cb30ca3df27c9..a748e7eb4429b 100644
--- a/Documentation/admin-guide/kdump/kdump.rst
+++ b/Documentation/admin-guide/kdump/kdump.rst
@@ -146,9 +146,9 @@ System kernel config options
CONFIG_SYSFS=y
Note that "sysfs file system support" might not appear in the "Pseudo
- filesystems" menu if "Configure standard kernel features (for small
- systems)" is not enabled in "General Setup." In this case, check the
- .config file itself to ensure that sysfs is turned on, as follows::
+ filesystems" menu if "Configure standard kernel features (expert users)"
+ is not enabled in "General Setup." In this case, check the .config file
+ itself to ensure that sysfs is turned on, as follows::
grep 'CONFIG_SYSFS' .config
@@ -533,6 +533,10 @@ the following command::
cp /proc/vmcore <dump-file>
+or use scp to write out the dump file between hosts on a network, e.g::
+
+ scp /proc/vmcore remote_username@remote_ip:<dump-file>
+
You can also use makedumpfile utility to write out the dump file
with specified options to filter out unwanted contents, e.g::
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 4681419b7857b..3c2b3e24e8f50 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1649,7 +1649,7 @@
[KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
enabled.
Allows heavy hugetlb users to free up some more
- memory (6 * PAGE_SIZE for each 2MB hugetlb page).
+ memory (7 * PAGE_SIZE for each 2MB hugetlb page).
Format: { on | off (default) }
on: enable the feature
@@ -3765,6 +3765,7 @@
bit 3: print locks info if CONFIG_LOCKDEP is on
bit 4: print ftrace buffer
bit 5: print all printk messages in buffer
+ bit 6: print all CPUs backtrace (if available in the arch)
panic_on_taint= Bitmask for conditionally calling panic() in add_taint()
Format: <hex>[,nousertaint]
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 59b84904a8543..1e06435b8ff67 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -108,19 +108,23 @@ In such cases, users can explicitly set the initial monitoring target regions
as they want, by writing proper values to the ``init_regions`` file. Each line
of the input should represent one region in below form.::
- <target id> <start address> <end address>
+ <target idx> <start address> <end address>
-The ``target id`` should already in ``target_ids`` file, and the regions should
-be passed in address order. For example, below commands will set a couple of
-address ranges, ``1-100`` and ``100-200`` as the initial monitoring target
-region of process 42, and another couple of address ranges, ``20-40`` and
-``50-100`` as that of process 4242.::
+The ``target idx`` should be the index of the target in ``target_ids`` file,
+starting from ``0``, and the regions should be passed in address order. For
+example, below commands will set a couple of address ranges, ``1-100`` and
+``100-200`` as the initial monitoring target region of pid 42, which is the
+first one (index ``0``) in ``target_ids``, and another couple of address
+ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one
+(index ``1``) in ``target_ids``.::
# cd <debugfs>/damon
- # echo "42 1 100
- 42 100 200
- 4242 20 40
- 4242 50 100" > init_regions
+ # cat target_ids
+ 42 4242
+ # echo "0 1 100
+ 0 100 200
+ 1 20 40
+ 1 50 100" > init_regions
Note that this sets the initial monitoring target regions only. In case of
virtual memory monitoring, DAMON will automatically updates the boundary of the
diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst
index 8edb8d578caf7..6e6f7b0d6562b 100644
--- a/Documentation/admin-guide/mm/zswap.rst
+++ b/Documentation/admin-guide/mm/zswap.rst
@@ -130,9 +130,25 @@ attribute, e.g.::
echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
When zswap same-filled page identification is disabled at runtime, it will stop
-checking for the same-value filled pages during store operation. However, the
-existing pages which are marked as same-value filled pages remain stored
-unchanged in zswap until they are either loaded or invalidated.
+checking for the same-value filled pages during store operation.
+In other words, every page will be then considered non-same-value filled.
+However, the existing pages which are marked as same-value filled pages remain
+stored unchanged in zswap until they are either loaded or invalidated.
+
+In some circumstances it might be advantageous to make use of just the zswap
+ability to efficiently store same-filled pages without enabling the whole
+compressed page storage.
+In this case the handling of non-same-value pages by zswap (enabled by default)
+can be disabled by setting the ``non_same_filled_pages_enabled`` attribute
+to 0, e.g. ``zswap.non_same_filled_pages_enabled=0``.
+It can also be enabled and disabled at runtime using the sysfs
+``non_same_filled_pages_enabled`` attribute, e.g.::
+
+ echo 1 > /sys/module/zswap/parameters/non_same_filled_pages_enabled
+
+Disabling both ``zswap.same_filled_pages_enabled`` and
+``zswap.non_same_filled_pages_enabled`` effectively disables accepting any new
+pages by zswap.
To prevent zswap from shrinking pool when zswap is full and there's a high
pressure on swap (this will result in flipping pages in and out zswap pool
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index d359bcfadd39a..078e1aff4adcf 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -595,16 +595,23 @@ Documentation/admin-guide/kernel-parameters.rst).
numa_balancing
==============
-Enables/disables automatic page fault based NUMA memory
-balancing. Memory is moved automatically to nodes
-that access it often.
+Enables/disables and configures automatic page fault based NUMA memory
+balancing. Memory is moved automatically to nodes that access it often.
+The value to set can be the result of ORing the following::
-Enables/disables automatic NUMA memory balancing. On NUMA machines, there
-is a performance penalty if remote memory is accessed by a CPU. When this
-feature is enabled the kernel samples what task thread is accessing memory
-by periodically unmapping pages and later trapping a page fault. At the
-time of the page fault, it is determined if the data being accessed should
-be migrated to a local memory node.
+= =================================
+0 NUMA_BALANCING_DISABLED
+1 NUMA_BALANCING_NORMAL
+2 NUMA_BALANCING_MEMORY_TIERING
+= =================================
+
+Or NUMA_BALANCING_NORMAL to optimize page placement among different
+NUMA nodes to reduce remote accessing. On NUMA machines, there is a
+performance penalty if remote memory is accessed by a CPU. When this
+feature is enabled the kernel samples what task thread is accessing
+memory by periodically unmapping pages and later trapping a page
+fault. At the time of the page fault, it is determined if the data
+being accessed should be migrated to a local memory node.
The unmapping of pages and trapping faults incur additional overhead that
ideally is offset by improved memory locality but there is no universal
@@ -615,6 +622,10 @@ faults may be controlled by the `numa_balancing_scan_period_min_ms,
numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
numa_balancing_scan_size_mb`_, and numa_balancing_settle_count sysctls.
+Or NUMA_BALANCING_MEMORY_TIERING to optimize page placement among
+different types of memory (represented as different NUMA nodes) to
+place the hot pages in the fast memory. This is implemented based on
+unmapping and page fault too.
numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
===============================================================================================================================
@@ -795,6 +806,8 @@ bit 1 print system memory info
bit 2 print timer info
bit 3 print locks info if ``CONFIG_LOCKDEP`` is on
bit 4 print ftrace buffer
+bit 5 print all printk messages in buffer
+bit 6 print all CPUs backtrace (if available in the arch)
===== ============================================
So for example to print tasks and memory info on panic, user can::
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 8089c559d339c..7614a1fc30fac 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -30,7 +30,7 @@ Software tag-based KASAN mode is only supported in Clang.
The hardware KASAN mode (#3) relies on hardware to perform the checks but
still requires a compiler version that supports memory tagging instructions.
-This mode is supported in GCC 10+ and Clang 11+.
+This mode is supported in GCC 10+ and Clang 12+.
Both software KASAN modes work with SLUB and SLAB memory allocators,
while the hardware tag-based KASAN currently only supports SLUB.
@@ -206,6 +206,9 @@ additional boot parameters that allow disabling KASAN or controlling features:
Asymmetric mode: a bad access is detected synchronously on reads and
asynchronously on writes.
+- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
+ allocations (default: ``on``).
+
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
traces collection (default: ``on``).
@@ -279,8 +282,8 @@ Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through
pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently
reserved to tag freed memory regions.
-Software tag-based KASAN currently only supports tagging of slab and page_alloc
-memory.
+Software tag-based KASAN currently only supports tagging of slab, page_alloc,
+and vmalloc memory.
Hardware tag-based KASAN
~~~~~~~~~~~~~~~~~~~~~~~~
@@ -303,8 +306,8 @@ Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through
pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently
reserved to tag freed memory regions.
-Hardware tag-based KASAN currently only supports tagging of slab and page_alloc
-memory.
+Hardware tag-based KASAN currently only supports tagging of slab, page_alloc,
+and VM_ALLOC-based vmalloc memory.
If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN
will not be enabled. In this case, all KASAN boot parameters are ignored.
@@ -319,6 +322,8 @@ checking gets disabled.
Shadow memory
-------------
+The contents of this section are only applicable to software KASAN modes.
+
The kernel maps memory in several different parts of the address space.
The range of kernel virtual addresses is large: there is not enough real
memory to support a real shadow region for every address that could be
@@ -349,7 +354,7 @@ CONFIG_KASAN_VMALLOC
With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the
cost of greater memory usage. Currently, this is supported on x86,
-riscv, s390, and powerpc.
+arm64, riscv, s390, and powerpc.
This works by hooking into vmalloc and vmap and dynamically
allocating real shadow memory to back the mappings.
diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst
index 6a1fadf3e1735..e38e9d83c1c72 100644
--- a/Documentation/vm/balance.rst
+++ b/Documentation/vm/balance.rst
@@ -6,7 +6,7 @@ Memory Balancing
Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
-Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
+Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as
well as for non __GFP_IO allocations.
The first reason why a caller may avoid reclaim is that the caller can not
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
index 9837fc8147dd6..2b54e82b9fe15 100644
--- a/Documentation/vm/page_owner.rst
+++ b/Documentation/vm/page_owner.rst
@@ -89,22 +89,41 @@ Usage
Page allocated via order XXX, ...
PFN XXX ...
- // Detailed stack
+ // Detailed stack
Page allocated via order XXX, ...
PFN XXX ...
- // Detailed stack
+ // Detailed stack
The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
in buf, uses regexp to extract the page order value, counts the times
- and pages of buf, and finally sorts them according to the times.
+ and pages of buf, and finally sorts them according to the parameter(s).
See the result about who allocated each page
in the ``sorted_page_owner.txt``. General output:
XXX times, XXX pages:
Page allocated via order XXX, ...
- // Detailed stack
+ // Detailed stack
By default, ``page_owner_sort`` is sorted according to the times of buf.
- If you want to sort by the pages nums of buf, use the ``-m`` parameter.
+ If you want to sort by the page nums of buf, use the ``-m`` parameter.
+ The detailed parameters are:
+
+ fundamental function:
+
+ Sort:
+ -a Sort by memory allocation time.
+ -m Sort by total memory.
+ -p Sort by pid.
+ -r Sort by memory release time.
+ -s Sort by stack trace.
+ -t Sort by times (default).
+
+ additional function:
+
+ Cull:
+ -c Cull by comparing stacktrace instead of total block.
+
+ Filter:
+ -f Filter out the information of blocks whose memory has not been released.
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 72150cb8db028..4d94ffacb8680 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -37,6 +37,7 @@ config ARM
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_MEMTEST
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
+ select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_WANT_LD_ORPHAN_WARN
select BINFMT_FLAT_ARGVP_ENVP_ON_STACK
@@ -1514,9 +1515,6 @@ config HW_PERF_EVENTS
def_bool y
depends on ARM_PMU
-config ARCH_WANT_GENERAL_HUGETLB
- def_bool y
-
config ARM_MODULE_PLTS
bool "Use PLTs to allow module memory to spill over into vmalloc area"
depends on MODULES
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7c3081e566699..8719051b45274 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -23,6 +23,7 @@ config ARM64
select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_FAST_MULTIPLIER
+ select ARCH_HAS_FILTER_PGPROT
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
@@ -205,7 +206,7 @@ config ARM64
select IOMMU_DMA if IOMMU_SUPPORT
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
- select KASAN_VMALLOC if KASAN_GENERIC
+ select KASAN_VMALLOC if KASAN
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
@@ -1252,9 +1253,6 @@ config HW_PERF_EVENTS
def_bool y
depends on ARM_PMU
-config ARCH_HAS_FILTER_PGPROT
- def_bool y
-
# Supported by clang >= 7.0
config CC_HAVE_SHADOW_CALL_STACK
def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18)
diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index b9185503feae2..38fafffe699f7 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -25,4 +25,10 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot)
#endif
+#define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged
+static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
+{
+ return pgprot_tagged(prot);
+}
+
#endif /* _ASM_ARM64_VMALLOC_H */
diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h
index 894e031b28d28..20873099c035c 100644
--- a/arch/arm64/include/asm/vmap_stack.h
+++ b/arch/arm64/include/asm/vmap_stack.h
@@ -17,10 +17,13 @@
*/
static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node)
{
+ void *p;
+
BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK));
- return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node,
+ p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node,
__builtin_return_address(0));
+ return kasan_reset_tag(p);
}
#endif /* __ASM_VMAP_STACK_H */
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 309a27553c875..f2d4bb14bfabe 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -58,12 +58,13 @@ void *module_alloc(unsigned long size)
PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
- if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
+ if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
- return p;
+ /* Memory is intended to be executable, reset the pointer tag. */
+ return kasan_reset_tag(p);
}
enum aarch64_reloc_op {
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index f70573928f1bf..3505789cf4bd9 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -406,9 +406,6 @@ static int __init topology_init(void)
{
int i;
- for_each_online_node(i)
- register_one_node(i);
-
for_each_possible_cpu(i) {
struct cpu *cpu = &per_cpu(cpu_data.cpu, i);
cpu->hotpluggable = cpu_can_disable(i);
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index ffb9c229610ab..228226c5fa809 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -347,6 +347,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
size_t pagesize = 1UL << shift;
+ entry = pte_mkhuge(entry);
if (pagesize == CONT_PTE_SIZE) {
entry = pte_mkcont(entry);
} else if (pagesize == CONT_PMD_SIZE) {
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index db63cc885771a..3973e305adc89 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -64,7 +64,6 @@ EXPORT_SYMBOL(memstart_addr);
*/
phys_addr_t arm64_dma_phys_limit __ro_after_init;
-#ifdef CONFIG_KEXEC_CORE
/*
* reserve_crashkernel() - reserves memory for crash kernel
*
@@ -78,6 +77,9 @@ static void __init reserve_crashkernel(void)
unsigned long long crash_max = arm64_dma_phys_limit;
int ret;
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base);
/* no crashkernel= or invalid value specified */
@@ -110,11 +112,6 @@ static void __init reserve_crashkernel(void)
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
}
-#else
-static void __init reserve_crashkernel(void)
-{
-}
-#endif /* CONFIG_KEXEC_CORE */
/*
* Return the maximum physical address for a zone accessible by the given bits
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index a3bacd79507a4..64e985eaa52d8 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -85,7 +85,7 @@ static int change_memory_common(unsigned long addr, int numpages,
*/
area = find_vm_area((void *)addr);
if (!area ||
- end > (unsigned long)area->addr + area->size ||
+ end > (unsigned long)kasan_reset_tag(area->addr) + area->size ||
!(area->flags & VM_ALLOC))
return -EINVAL;
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 74f9a9b6a0530..9ea348149a69f 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1155,7 +1155,8 @@ u64 bpf_jit_alloc_exec_limit(void)
void *bpf_jit_alloc_exec(unsigned long size)
{
- return vmalloc(size);
+ /* Memory is intended to be executable, reset the pointer tag. */
+ return kasan_reset_tag(vmalloc(size));
}
void bpf_jit_free_exec(void *addr)
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index f01e91e10d95d..3167a3b5c97b0 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -29,8 +29,6 @@ int max_kernel_seg = 0x303;
/* indicate pfn's of high memory */
unsigned long highstart_pfn, highend_pfn;
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
/* Default cache attribute for newly created page tables */
unsigned long _dflt_cache_att = CACHEDEF;
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index e4992917a24b7..94a848b06f15a 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -70,16 +70,6 @@ static int __init topology_init(void)
{
int i, err = 0;
-#ifdef CONFIG_NUMA
- /*
- * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes?
- */
- for_each_online_node(i) {
- if ((err = register_one_node(i)))
- goto out;
- }
-#endif
-
sysfs_cpus = kcalloc(NR_CPUS, sizeof(struct ia64_cpu), GFP_KERNEL);
if (!sysfs_cpus)
panic("kzalloc in topology_init failed - NR_CPUS too big?");
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 791d4176e4a6b..73d0db36edb60 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -608,17 +608,11 @@ void __init paging_init(void)
zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-pg_data_t *arch_alloc_nodedata(int nid)
+pg_data_t * __init arch_alloc_nodedata(int nid)
{
unsigned long size = compute_pernodesize(nid);
- return kzalloc(size, GFP_KERNEL);
-}
-
-void arch_free_nodedata(pg_data_t *pgdat)
-{
- kfree(pgdat);
+ return memblock_alloc(size, SMP_CACHE_BYTES);
}
void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
@@ -626,7 +620,6 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
pgdat_list[update_node] = update_pgdat;
scatter_node_data();
}
-#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
diff --git a/arch/mips/kernel/topology.c b/arch/mips/kernel/topology.c
index 08ad6371fbe08..9429d85a4703c 100644
--- a/arch/mips/kernel/topology.c
+++ b/arch/mips/kernel/topology.c
@@ -12,11 +12,6 @@ static int __init topology_init(void)
{
int i, ret;
-#ifdef CONFIG_NUMA
- for_each_online_node(i)
- register_one_node(i);
-#endif /* CONFIG_NUMA */
-
for_each_present_cpu(i) {
struct cpu *c = &per_cpu(cpu_devices, i);
diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c
index f63f839738c46..825c85cab1a1d 100644
--- a/arch/nds32/mm/init.c
+++ b/arch/nds32/mm/init.c
@@ -18,7 +18,6 @@
#include <asm/tlb.h>
#include <asm/page.h>
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
DEFINE_SPINLOCK(anon_alias_lock);
extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index 97305bde1b169..3a021ab6f1aef 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -38,8 +38,6 @@
int mem_init_done;
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
static void __init zone_sizes_init(void)
{
unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index 64b6c608eca43..de092b04ee1a1 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -71,9 +71,9 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags
size_t size = 1UL << shift;
if (size == SZ_16K)
- return __pte(pte_val(entry) & ~_PAGE_HUGE);
+ return __pte(pte_val(entry) | _PAGE_SPS);
else
- return entry;
+ return __pte(pte_val(entry) | _PAGE_SPS | _PAGE_HUGE);
}
#define arch_make_huge_pte arch_make_huge_pte
#endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d03e488cfe9ca..d0ad86b67e665 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -113,6 +113,12 @@ static int __init fadump_cma_init(void)
}
/*
+ * If CMA activation fails, keep the pages reserved, instead of
+ * exposing them to buddy allocator. Same as 'fadump=nocma' case.
+ */
+ cma_reserve_pages_on_error(fadump_cma);
+
+ /*
* So we now have successfully initialized cma area for fadump.
*/
pr_info("Initialized 0x%lx bytes cma area at %ldMB from 0x%lx "
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index d45a415d5374b..2069bbb90a9a3 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -1110,14 +1110,6 @@ EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group);
/* NUMA stuff */
#ifdef CONFIG_NUMA
-static void __init register_nodes(void)
-{
- int i;
-
- for (i = 0; i < MAX_NUMNODES; i++)
- register_one_node(i);
-}
-
int sysfs_add_device_to_node(struct device *dev, int nid)
{
struct node *node = node_devices[nid];
@@ -1132,13 +1124,6 @@ void sysfs_remove_device_from_node(struct device *dev, int nid)
sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj));
}
EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node);
-
-#else
-static void __init register_nodes(void)
-{
- return;
-}
-
#endif
/* Only valid if CPU is present. */
@@ -1155,8 +1140,6 @@ static int __init topology_init(void)
{
int cpu, r;
- register_nodes();
-
for_each_possible_cpu(cpu) {
struct cpu *c = &per_cpu(cpu_devices, cpu);
diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c
index b86e7b9062571..ccd64b5e6cac7 100644
--- a/arch/powerpc/mm/book3s64/trace.c
+++ b/arch/powerpc/mm/book3s64/trace.c
@@ -3,6 +3,5 @@
* This file is for defining trace points and trace related helpers.
*/
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
#endif
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 5adcbd9b5e886..0804b9a11934d 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -40,6 +40,7 @@ config RISCV
select ARCH_USE_MEMTEST
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_FRAME_POINTERS
+ select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
select BUILDTIME_TABLE_SORT if MMU
@@ -171,9 +172,6 @@ config ARCH_SPARSEMEM_ENABLE
config ARCH_SELECT_MEMORY_MODEL
def_bool ARCH_SPARSEMEM_ENABLE
-config ARCH_WANT_GENERAL_HUGETLB
- def_bool y
-
config ARCH_SUPPORTS_UPROBES
def_bool y
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index b42bfdc674823..834eb652a7b9d 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -301,9 +301,6 @@ static int __init topology_init(void)
{
int i, ret;
- for_each_online_node(i)
- register_one_node(i);
-
for_each_possible_cpu(i) {
struct cpu *cpu = &per_cpu(cpu_devices, i);
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index c27294128e182..ff2f41b3b558d 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -957,7 +957,6 @@ static inline void setup_vm_final(void)
}
#endif /* CONFIG_MMU */
-#ifdef CONFIG_KEXEC_CORE
/*
* reserve_crashkernel() - reserves memory for crash kernel
*
@@ -974,6 +973,8 @@ static void __init reserve_crashkernel(void)
int ret = 0;
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
/*
* Don't reserve a region for a crash kernel on a crash kernel
* since it doesn't make much sense and we have limited memory
@@ -1023,7 +1024,6 @@ static void __init reserve_crashkernel(void)
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
}
-#endif /* CONFIG_KEXEC_CORE */
void __init paging_init(void)
{
@@ -1037,9 +1037,7 @@ void __init misc_mem_init(void)
arch_numa_init();
sparse_init();
zone_sizes_init();
-#ifdef CONFIG_KEXEC_CORE
reserve_crashkernel();
-#endif
memblock_dump_all();
}
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index b032e556eeb71..a7aefc278909b 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -45,7 +45,7 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
__builtin_return_address(0));
- if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
+ if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c
index 51c5a9f6e5257..23ab9f02f2787 100644
--- a/arch/s390/kernel/numa.c
+++ b/arch/s390/kernel/numa.c
@@ -33,10 +33,3 @@ void __init numa_setup(void)
NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT;
NODE_DATA(0)->node_id = 0;
}
-
-static int __init numa_init_late(void)
-{
- register_one_node(0);
- return 0;
-}
-arch_initcall(numa_init_late);
diff --git a/arch/sh/kernel/topology.c b/arch/sh/kernel/topology.c
index 76af6db9daa23..2d2a7509b565a 100644
--- a/arch/sh/kernel/topology.c
+++ b/arch/sh/kernel/topology.c
@@ -46,11 +46,6 @@ static int __init topology_init(void)
{
int i, ret;
-#ifdef CONFIG_NUMA
- for_each_online_node(i)
- register_one_node(i);
-#endif
-
for_each_present_cpu(i) {
struct cpu *c = &per_cpu(cpu_devices, i);
diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c
index 6d60d416f0dd7..f19487e4cc71e 100644
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -244,22 +244,10 @@ static void __init check_mmu_stats(void)
mmu_stats_supported = 1;
}
-static void register_nodes(void)
-{
-#ifdef CONFIG_NUMA
- int i;
-
- for (i = 0; i < MAX_NUMNODES; i++)
- register_one_node(i);
-#endif
-}
-
static int __init topology_init(void)
{
int cpu, ret;
- register_nodes();
-
check_mmu_stats();
for_each_possible_cpu(cpu) {
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 0f49fada20938..d8e0e3c7038d0 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -181,6 +181,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
pte_t pte;
+ entry = pte_mkhuge(entry);
pte = hugepage_shift_to_tte(entry, shift);
#ifdef CONFIG_SPARC64
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 391c4cac8958f..1c7e772a7403c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -118,6 +118,7 @@ config X86
select ARCH_WANT_DEFAULT_BPF_JIT if X86_64
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANTS_NO_INSTR
+ select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_HUGE_PMD_SHARE
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_THP_SWAP if X86_64
@@ -333,9 +334,6 @@ config GENERIC_CALIBRATE_DELAY
config ARCH_HAS_CPU_RELAX
def_bool y
-config ARCH_HAS_FILTER_PGPROT
- def_bool y
-
config ARCH_HIBERNATION_POSSIBLE
def_bool y
@@ -347,9 +345,6 @@ config ARCH_NR_GPIO
config ARCH_SUSPEND_POSSIBLE
def_bool y
-config ARCH_WANT_GENERAL_HUGETLB
- def_bool y
-
config AUDIT_ARCH
def_bool y if X86_64
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 95fa745e310a5..c9eb8aa3b7b8a 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -78,7 +78,7 @@ void *module_alloc(unsigned long size)
MODULES_END, gfp_mask,
PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
__builtin_return_address(0));
- if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
+ if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f7a132eb794d8..af2d2dc438a20 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -391,8 +391,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
* --------- Crashkernel reservation ------------------------------
*/
-#ifdef CONFIG_KEXEC_CORE
-
/* 16M alignment for crash kernel regions */
#define CRASH_ALIGN SZ_16M
@@ -470,6 +468,9 @@ static void __init reserve_crashkernel(void)
bool high = false;
int ret;
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
total_mem = memblock_phys_mem_size();
/* crashkernel=XM */
@@ -535,11 +536,6 @@ static void __init reserve_crashkernel(void)
crashk_res.end = crash_base + crash_size - 1;
insert_resource(&iomem_resource, &crashk_res);
}
-#else
-static void __init reserve_crashkernel(void)
-{
-}
-#endif
static struct resource standard_io_resources[] = {
{ .name = "dma1", .start = 0x00, .end = 0x1f,
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index bd83748e2bde3..8617d1ed9d31b 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -154,11 +154,6 @@ static int __init topology_init(void)
{
int i;
-#ifdef CONFIG_NUMA
- for_each_online_node(i)
- register_one_node(i);
-#endif
-
for_each_present_cpu(i)
arch_register_cpu(i);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 4ba024d5b63ae..d8cfce221275e 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -31,7 +31,6 @@
* We need to define the tracepoints somewhere, and tlb.c
* is only compiled when SMP=y.
*/
-#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
#include "mm_internal.h"
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0c612a9116967..4e645ae1e0665 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5448,7 +5448,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
bfqq = bic_to_bfqq(bic, false);
if (bfqq) {
bfq_release_process_ref(bfqd, bfqq);
- bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true);
+ bfqq = bfq_get_queue(bfqd, bio, false, bic, true);
bic_set_bfqq(bic, bfqq, false);
}
diff --git a/drivers/base/init.c b/drivers/base/init.c
index a9f57c22fb9e2..d8d0fe687111a 100644
--- a/drivers/base/init.c
+++ b/drivers/base/init.c
@@ -35,5 +35,6 @@ void __init driver_init(void)
auxiliary_bus_init();
cpu_dev_init();
memory_dev_init();
+ node_dev_init();
container_dev_init();
}
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 365cd4a7f2397..60c38f9cf1a75 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -663,14 +663,16 @@ static int init_memory_block(unsigned long block_id, unsigned long state,
mem->nr_vmemmap_pages = nr_vmemmap_pages;
INIT_LIST_HEAD(&mem->group_next);
+ ret = register_memory(mem);
+ if (ret)
+ return ret;
+
if (group) {
mem->group = group;
list_add(&mem->group_next, &group->memory_blocks);
}
- ret = register_memory(mem);
-
- return ret;
+ return 0;
}
static int add_memory_block(unsigned long base_section_nr)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 87acc47e89515..a133981a12fc6 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -1065,26 +1065,30 @@ static const struct attribute_group *cpu_root_attr_groups[] = {
};
#define NODE_CALLBACK_PRI 2 /* lower than SLAB */
-static int __init register_node_type(void)
+void __init node_dev_init(void)
{
- int ret;
+ static struct notifier_block node_memory_callback_nb = {
+ .notifier_call = node_memory_callback,
+ .priority = NODE_CALLBACK_PRI,
+ };
+ int ret, i;
BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES);
BUILD_BUG_ON(ARRAY_SIZE(node_state_attrs)-1 != NR_NODE_STATES);
ret = subsys_system_register(&node_subsys, cpu_root_attr_groups);
- if (!ret) {
- static struct notifier_block node_memory_callback_nb = {
- .notifier_call = node_memory_callback,
- .priority = NODE_CALLBACK_PRI,
- };
- register_hotmemory_notifier(&node_memory_callback_nb);
- }
+ if (ret)
+ panic("%s() failed to register subsystem: %d\n", __func__, ret);
+
+ register_hotmemory_notifier(&node_memory_callback_nb);
/*
- * Note: we're not going to unregister the node class if we fail
- * to register the node state class attribute files.
+ * Create all node devices, which will properly link the node
+ * to applicable memory block devices and already created cpu devices.
*/
- return ret;
+ for_each_online_node(i) {
+ ret = register_one_node(i);
+ if (ret)
+ panic("%s() failed to add node: %d\n", __func__, ret);
+ }
}
-postcore_initcall(register_node_type);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index acb1ad3c06035..4b55e864a0a34 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -637,9 +637,6 @@ enum {
STATE_SENT, /* Do not change state/UUIDs while this is set */
CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
* pending, from drbd worker context.
- * If set, bdi_write_congested() returns true,
- * so shrink_page_list() would not recurse into,
- * and potentially deadlock on, this drbd worker.
*/
DISCONNECT_SENT,
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index c00ae8619519e..2f608525a8790 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -910,8 +910,7 @@ static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t se
switch (rbm) {
case RB_CONGESTED_REMOTE:
- return bdi_read_congested(
- device->ldev->backing_bdev->bd_disk->bdi);
+ return 0;
case RB_LEAST_PENDING:
return atomic_read(&device->local_cnt) >
atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index d986f9ee0e1f4..8430f64757233 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -656,9 +656,12 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
migrate.vma = vma;
migrate.start = start;
migrate.end = end;
- migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
+ if (adev->gmc.xgmi.connected_to_cpu)
+ migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+ else
+ migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
size = 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t);
size *= npages;
buf = kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
@@ -930,7 +933,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
{
struct kfd_dev *kfddev = adev->kfd.dev;
struct dev_pagemap *pgmap;
- struct resource *res;
+ struct resource *res = NULL;
unsigned long size;
void *r;
@@ -945,28 +948,34 @@ int svm_migrate_init(struct amdgpu_device *adev)
* should remove reserved size
*/
size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
- res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
- if (IS_ERR(res))
- return -ENOMEM;
+ if (adev->gmc.xgmi.connected_to_cpu) {
+ pgmap->range.start = adev->gmc.aper_base;
+ pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
+ pgmap->type = MEMORY_DEVICE_COHERENT;
+ } else {
+ res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
+ if (IS_ERR(res))
+ return -ENOMEM;
+ pgmap->range.start = res->start;
+ pgmap->range.end = res->end;
+ pgmap->type = MEMORY_DEVICE_PRIVATE;
+ }
- pgmap->type = MEMORY_DEVICE_PRIVATE;
pgmap->nr_range = 1;
- pgmap->range.start = res->start;
- pgmap->range.end = res->end;
pgmap->ops = &svm_migrate_pgmap_ops;
pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
- pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
-
+ pgmap->flags = 0;
/* Device manager releases device-specific resources, memory region and
* pgmap when driver disconnects from device.
*/
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
-
/* Disable SVM support capability */
pgmap->type = 0;
- devm_release_mem_region(adev->dev, res->start, resource_size(res));
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE)
+ devm_release_mem_region(adev->dev, res->start,
+ res->end - res->start + 1);
return PTR_ERR(r);
}
@@ -979,3 +988,4 @@ int svm_migrate_init(struct amdgpu_device *adev)
return 0;
}
+
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index e900e3c46903b..c5fa8b8673b6a 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -676,12 +676,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
* allocate page in a sleeping context if GFP flags permit. Hence
* spinlock needs to be unlocked and re-locked after allocation.
*/
- if (!(gfp & __GFP_ATOMIC))
+ if (gfp & __GFP_DIRECT_RECLAIM)
spin_unlock_irqrestore(&as->lock, *flags);
page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO);
- if (!(gfp & __GFP_ATOMIC))
+ if (gfp & __GFP_DIRECT_RECLAIM)
spin_lock_irqsave(&as->lock, *flags);
/*
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 605017eb9349e..e297682e2c713 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm);
#define ELF_CORE_EFLAGS 0
#endif
-#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1))
+#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1))
#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
@@ -170,8 +170,8 @@ static int padzero(unsigned long elf_bss)
static int
create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
- unsigned long load_addr, unsigned long interp_load_addr,
- unsigned long e_entry)
+ unsigned long interp_load_addr,
+ unsigned long e_entry, unsigned long phdr_addr)
{
struct mm_struct *mm = current->mm;
unsigned long p = bprm->p;
@@ -257,7 +257,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
- NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
+ NEW_AUX_ENT(AT_PHDR, phdr_addr);
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
NEW_AUX_ENT(AT_BASE, interp_load_addr);
@@ -399,22 +399,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
-static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
{
- int i, first_idx = -1, last_idx = -1;
+ elf_addr_t min_addr = -1;
+ elf_addr_t max_addr = 0;
+ bool pt_load = false;
+ int i;
for (i = 0; i < nr; i++) {
- if (cmds[i].p_type == PT_LOAD) {
- last_idx = i;
- if (first_idx == -1)
- first_idx = i;
+ if (phdr[i].p_type == PT_LOAD) {
+ min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr));
+ max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz);
+ pt_load = true;
}
}
- if (first_idx == -1)
- return 0;
-
- return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
- ELF_PAGESTART(cmds[first_idx].p_vaddr);
+ return pt_load ? (max_addr - min_addr) : 0;
}
static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
@@ -823,8 +822,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
- unsigned long load_addr = 0, load_bias = 0;
- int load_addr_set = 0;
+ unsigned long load_bias = 0, phdr_addr = 0;
+ int first_pt_load = 1;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
@@ -1074,12 +1073,12 @@ out_free_interp:
vaddr = elf_ppnt->p_vaddr;
/*
- * The first time through the loop, load_addr_set is false:
+ * The first time through the loop, first_pt_load is true:
* layout will be calculated. Once set, use MAP_FIXED since
* we know we've already safely mapped the entire region with
* MAP_FIXED_NOREPLACE in the once-per-binary logic following.
*/
- if (load_addr_set) {
+ if (!first_pt_load) {
elf_flags |= MAP_FIXED;
} else if (elf_ex->e_type == ET_EXEC) {
/*
@@ -1117,7 +1116,7 @@ out_free_interp:
* without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
- if (alignment > ELF_MIN_ALIGN) {
+ if (interpreter || alignment > ELF_MIN_ALIGN) {
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
@@ -1139,10 +1138,10 @@ out_free_interp:
/*
* Calculate the entire size of the ELF mapping (total_size).
- * (Note that load_addr_set is set to true later once the
+ * (Note that first_pt_load is set to false later once the
* initial mapping is performed.)
*/
- if (!load_addr_set) {
+ if (first_pt_load) {
total_size = total_mapping_size(elf_phdata,
elf_ex->e_phnum);
if (!total_size) {
@@ -1159,16 +1158,25 @@ out_free_interp:
goto out_free_dentry;
}
- if (!load_addr_set) {
- load_addr_set = 1;
- load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
+ if (first_pt_load) {
+ first_pt_load = 0;
if (elf_ex->e_type == ET_DYN) {
load_bias += error -
ELF_PAGESTART(load_bias + vaddr);
- load_addr += load_bias;
reloc_func_desc = load_bias;
}
}
+
+ /*
+ * Figure out which segment in the file contains the Program
+ * Header table, and map to the associated memory address.
+ */
+ if (elf_ppnt->p_offset <= elf_ex->e_phoff &&
+ elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) {
+ phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset +
+ elf_ppnt->p_vaddr;
+ }
+
k = elf_ppnt->p_vaddr;
if ((elf_ppnt->p_flags & PF_X) && k < start_code)
start_code = k;
@@ -1204,6 +1212,7 @@ out_free_interp:
}
e_entry = elf_ex->e_entry + load_bias;
+ phdr_addr += load_bias;
elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
@@ -1267,8 +1276,8 @@ out_free_interp:
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
- retval = create_elf_tables(bprm, elf_ex,
- load_addr, interp_load_addr, e_entry);
+ retval = create_elf_tables(bprm, elf_ex, interp_load_addr,
+ e_entry, phdr_addr);
if (retval < 0)
goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index a17c386a142c7..571785c80f688 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -956,10 +956,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -979,6 +989,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1030,6 +1043,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1041,6 +1060,24 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1L :
+ bdev->bd_super->s_blocksize, size,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1 :
+ bdev->bd_super->s_blocksize_bits,
+ IS_ERR_OR_NULL(bdev->bd_inode) ? -1 :
+ bdev->bd_inode->i_blkbits);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3190,6 +3227,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3228,11 +3270,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3256,6 +3305,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c98e5238a1b6a..9147667f8cd55 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -57,11 +57,6 @@
* accounting is preserved.
*/
-#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
-#define CONGESTION_OFF_THRESH(congestion_kb) \
- (CONGESTION_ON_THRESH(congestion_kb) - \
- (CONGESTION_ON_THRESH(congestion_kb) >> 2))
-
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
struct folio *folio, void **_fsdata);
@@ -561,10 +556,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
inode, page, page->index, page_off, len, snapc, snapc->seq);
- if (atomic_long_inc_return(&fsc->writeback_count) >
- CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
-
req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
@@ -621,10 +612,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc); /* page's reference */
- if (atomic_long_dec_return(&fsc->writeback_count) <
- CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
-
return err;
}
@@ -704,12 +691,6 @@ static void writepages_finish(struct ceph_osd_request *req)
BUG_ON(!page);
WARN_ON(!PageUptodate(page));
- if (atomic_long_dec_return(&fsc->writeback_count) <
- CONGESTION_OFF_THRESH(
- fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
-
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
dout("unlocking %p\n", page);
@@ -952,14 +933,6 @@ get_more_pages:
dout("%p will write page %p idx %lu\n",
inode, page, page->index);
- if (atomic_long_inc_return(&fsc->writeback_count) >
- CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb)) {
- set_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
- }
-
-
pages[locked_pages++] = page;
pvec.pages[i] = NULL;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bf79f369aec68..b2f38af9fca83 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -801,8 +801,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->filp_gen = 1;
fsc->have_copy_from2 = true;
- atomic_long_set(&fsc->writeback_count, 0);
-
err = -ENOMEM;
/*
* The number of concurrent works can be high but they don't need
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 67f145e1ae7a3..fc58adf1d36ae 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -120,8 +120,6 @@ struct ceph_fs_client {
struct ceph_mds_client *mdsc;
- atomic_long_t writeback_count;
-
struct workqueue_struct *inode_wq;
struct workqueue_struct *cap_wq;
diff --git a/fs/coredump.c b/fs/coredump.c
index 1c060c0a2d72f..b73817712dd25 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -42,6 +42,7 @@
#include <linux/path.h>
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
+#include <linux/elf.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -980,6 +981,8 @@ static bool always_dump_vma(struct vm_area_struct *vma)
return false;
}
+#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1
+
/*
* Decide how much of @vma's contents should be included in a core dump.
*/
@@ -1039,9 +1042,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
* dump the first page to aid in determining what was mapped here.
*/
if (FILTER(ELF_HEADERS) &&
- vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) &&
- (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
- return PAGE_SIZE;
+ vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
+ if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
+ return PAGE_SIZE;
+
+ /*
+ * ELF libraries aren't always executable.
+ * We'll want to check whether the mapping starts with the ELF
+ * magic, but not now - we're holding the mmap lock,
+ * so copy_from_user() doesn't work here.
+ * Use a placeholder instead, and fix it up later in
+ * dump_vma_snapshot().
+ */
+ return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
+ }
#undef FILTER
@@ -1116,8 +1130,6 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
m->end = vma->vm_end;
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
-
- vma_data_size += m->dump_size;
}
mmap_write_unlock(mm);
@@ -1127,6 +1139,23 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
return -EFAULT;
}
+ for (i = 0; i < *vma_count; i++) {
+ struct core_vma_metadata *m = (*vma_meta) + i;
+
+ if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) {
+ char elfmag[SELFMAG];
+
+ if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) ||
+ memcmp(elfmag, ELFMAG, SELFMAG) != 0) {
+ m->dump_size = 0;
+ } else {
+ m->dump_size = PAGE_SIZE;
+ }
+ }
+
+ vma_data_size += m->dump_size;
+ }
+
*vma_data_size_ptr = vma_data_size;
return 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index 79f2c9483302d..40b1008fb0f79 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -495,8 +495,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm)
* the stack. They aren't stored until much later when we can't
* signal to the parent that the child has run out of stack space.
* Instead, calculate it here so it's possible to fail gracefully.
+ *
+ * In the case of argc = 0, make sure there is space for adding a
+ * empty string (which will bump argc to 1), to ensure confused
+ * userspace programs don't start processing from argv[1], thinking
+ * argc can never be 0, to keep them from walking envp by accident.
+ * See do_execveat_common().
*/
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
if (limit <= ptr_size)
return -E2BIG;
limit -= ptr_size;
@@ -1897,6 +1903,9 @@ static int do_execveat_common(int fd, struct filename *filename,
}
retval = count(argv, MAX_ARG_STRINGS);
+ if (retval == 0)
+ pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
+ current->comm, bprm->filename);
if (retval < 0)
goto out_free;
bprm->argc = retval;
@@ -1923,6 +1932,19 @@ static int do_execveat_common(int fd, struct filename *filename,
if (retval < 0)
goto out_free;
+ /*
+ * When argv is empty, add an empty string ("") as argv[0] to
+ * ensure confused userspace programs that start processing
+ * from argv[1] won't end up walking envp. See also
+ * bprm_stack_limits().
+ */
+ if (bprm->argc == 0) {
+ retval = copy_string_kernel("", bprm);
+ if (retval < 0)
+ goto out_free;
+ bprm->argc = 1;
+ }
+
retval = bprm_execve(bprm, fd, filename, flags);
out_free:
free_bprm(bprm);
@@ -1951,6 +1973,8 @@ int kernel_execve(const char *kernel_filename,
}
retval = count_strings_kernel(argv);
+ if (WARN_ON_ONCE(retval == 0))
+ retval = -EINVAL;
if (retval < 0)
goto out_free;
bprm->argc = retval;
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index df14e750e9fe3..998dd2ac80089 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,11 +170,6 @@ static void ext2_preread_inode(struct inode *inode)
unsigned long offset;
unsigned long block;
struct ext2_group_desc * gdp;
- struct backing_dev_info *bdi;
-
- bdi = inode_to_bdi(inode);
- if (bdi_rw_congested(bdi))
- return;
block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL);
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 67bac2792e571..54481b1d569a0 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1505,9 +1505,9 @@ continue_unlock:
if (IS_NOQUOTA(cc->inode))
return 0;
ret = 0;
- cond_resched();
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ /* Wait until we can get the lock, then try again. */
+ f2fs_lock_op(F2FS_I_SB(cc->inode));
+ f2fs_unlock_op(F2FS_I_SB(cc->inode));
goto retry_write;
}
return ret;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 55263dfa3504c..45037353fee34 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -3052,9 +3052,12 @@ result:
} else if (ret == -EAGAIN) {
ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ /* Wait until we can get the
+ * lock, then try again.
+ */
+ f2fs_lock_op(F2FS_I_SB(mapping->host));
+ f2fs_unlock_op(F2FS_I_SB(mapping->host));
+
goto retry_write;
}
goto next;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 56211e201d51b..78acaac9433d5 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -313,8 +313,8 @@ next:
skip:
iput(inode);
}
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
- cond_resched();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
if (gc_failure) {
if (++looped >= count)
return;
@@ -802,9 +802,10 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
do {
ret = __submit_flush_wait(sbi, FDEV(i).bdev);
- if (ret)
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ if (ret) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ }
} while (ret && --count);
if (ret) {
@@ -3137,7 +3138,8 @@ next:
blk_finish_plug(&plug);
mutex_unlock(&dcc->cmd_lock);
trimmed += __wait_all_discard_cmd(sbi, NULL);
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto next;
}
skip:
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index b9c22648a6365..56b78ae38a3d5 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2136,8 +2136,8 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
/* we should flush all the data to keep data consistency */
do {
sync_inodes_sb(sbi->sb);
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
} while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--);
if (unlikely(retry < 0))
@@ -2505,8 +2505,8 @@ retry:
&page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 33d54c9fbefc0..01843bc5fef18 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -894,43 +894,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
/**
- * inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion (may be NULL)
- * @cong_bits: mask of WB_[a]sync_congested bits to test
- *
- * Tests whether @inode is congested. @cong_bits is the mask of congestion
- * bits to test and the return value is the mask of set bits.
- *
- * If cgroup writeback is enabled for @inode, the congestion state is
- * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
- * associated with @inode is congested; otherwise, the root wb's congestion
- * state is used.
- *
- * @inode is allowed to be NULL as this function is often called on
- * mapping->host which is NULL for the swapper space.
- */
-int inode_congested(struct inode *inode, int cong_bits)
-{
- /*
- * Once set, ->i_wb never becomes NULL while the inode is alive.
- * Start transaction iff ->i_wb is visible.
- */
- if (inode && inode_to_wb_is_valid(inode)) {
- struct bdi_writeback *wb;
- struct wb_lock_cookie lock_cookie = {};
- bool congested;
-
- wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
- congested = wb_congested(wb, cong_bits);
- unlocked_inode_to_wb_end(inode, &lock_cookie);
- return congested;
- }
-
- return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
-}
-EXPORT_SYMBOL_GPL(inode_congested);
-
-/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
* @nr_pages: number of pages to write for the whole bdi
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 000d2e5627e99..7cede9a3bc962 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
{
unsigned val;
struct fuse_conn *fc;
- struct fuse_mount *fm;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
down_read(&fc->killsb);
spin_lock(&fc->bg_lock);
fc->congestion_threshold = val;
-
- /*
- * Get any fuse_mount belonging to this fuse_conn; s_bdi is
- * shared between all of them
- */
-
- if (!list_empty(&fc->mounts)) {
- fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
- if (fc->num_background < fc->congestion_threshold) {
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- } else {
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
- }
spin_unlock(&fc->bg_lock);
up_read(&fc->killsb);
fuse_conn_put(fc);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cd54a529460da..e1b4a846c90d1 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req)
wake_up(&fc->blocked_waitq);
}
- if (fc->num_background == fc->congestion_threshold && fm->sb) {
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
fc->num_background--;
fc->active_background--;
flush_bg_queue(fc);
@@ -540,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_req *req)
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fm->sb) {
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
queued = true;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index a71f1cf894b9f..d4bd94234ef73 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -447,7 +447,8 @@ static const struct address_space_operations minix_aops = {
.writepage = minix_writepage,
.write_begin = minix_write_begin,
.write_end = generic_write_end,
- .bmap = minix_bmap
+ .bmap = minix_bmap,
+ .direct_IO = noop_direct_IO
};
static const struct inode_operations minix_symlink_inode_operations = {
diff --git a/fs/namei.c b/fs/namei.c
index 3f1829b3ab5b7..e596aabd6dc5c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1031,7 +1031,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_symlinks",
.data = &sysctl_protected_symlinks,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -1040,7 +1040,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_hardlinks",
.data = &sysctl_protected_hardlinks,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -1049,7 +1049,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_fifos",
.data = &sysctl_protected_fifos,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
@@ -1058,7 +1058,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_regular",
.data = &sysctl_protected_regular,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
diff --git a/fs/namespace.c b/fs/namespace.c
index c6feb92209a6f..fec0f79aa2eb7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2583,6 +2583,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
struct super_block *sb = mnt->mnt_sb;
if (!__mnt_is_readonly(mnt) &&
+ (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
char *buf = (char *)__get_free_page(GFP_KERNEL);
char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
@@ -2597,6 +2598,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
tm.tm_year+1900, (unsigned long long)sb->s_time_max);
free_page((unsigned long)buf);
+ sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
}
}
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 7aea195ddb353..18f3ff77fd0c2 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -22,13 +22,6 @@ static struct ctl_table nfs_cb_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
- .procname = "nfs_congestion_kb",
- .data = &nfs_congestion_kb,
- .maxlen = sizeof(nfs_congestion_kb),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
{ }
};
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 987a187bd39aa..1c22ea6f23c3a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -397,33 +397,8 @@ static int wb_priority(struct writeback_control *wbc)
return ret;
}
-/*
- * NFS congestion control
- */
-
-int nfs_congestion_kb;
-
-#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10))
-#define NFS_CONGESTION_OFF_THRESH \
- (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
-
-static void nfs_set_page_writeback(struct page *page)
-{
- struct inode *inode = page_file_mapping(page)->host;
- struct nfs_server *nfss = NFS_SERVER(inode);
- int ret = test_set_page_writeback(page);
-
- WARN_ON_ONCE(ret != 0);
-
- if (atomic_long_inc_return(&nfss->writeback) >
- NFS_CONGESTION_ON_THRESH)
- set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
-}
-
static void nfs_end_page_writeback(struct nfs_page *req)
{
- struct inode *inode = page_file_mapping(req->wb_page)->host;
- struct nfs_server *nfss = NFS_SERVER(inode);
bool is_done;
is_done = nfs_page_group_sync_on_bit(req, PG_WB_END);
@@ -432,8 +407,6 @@ static void nfs_end_page_writeback(struct nfs_page *req)
return;
end_page_writeback(req->wb_page);
- if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
}
/*
@@ -617,7 +590,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
if (IS_ERR(req))
goto out;
- nfs_set_page_writeback(page);
+ set_page_writeback(page);
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
/* If there is a fatal error that covers this write, just exit */
@@ -1850,7 +1823,6 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
struct nfs_page *req;
int status = data->task.tk_status;
struct nfs_commit_info cinfo;
- struct nfs_server *nfss;
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
@@ -1891,9 +1863,6 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* Latency breaker */
cond_resched();
}
- nfss = NFS_SERVER(data->inode);
- if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
nfs_commit_end(cinfo.mds);
@@ -2162,26 +2131,6 @@ int __init nfs_init_writepagecache(void)
if (nfs_commit_mempool == NULL)
goto out_destroy_commit_cache;
- /*
- * NFS congestion size, scale with available memory.
- *
- * 64MB: 8192k
- * 128MB: 11585k
- * 256MB: 16384k
- * 512MB: 23170k
- * 1GB: 32768k
- * 2GB: 46340k
- * 4GB: 65536k
- * 8GB: 92681k
- * 16GB: 131072k
- *
- * This allows larger machines to have larger/more transfers.
- * Limit the default to 256M
- */
- nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
- if (nfs_congestion_kb > 256*1024)
- nfs_congestion_kb = 256*1024;
-
return 0;
out_destroy_commit_cache:
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 4f71faacd8253..9e5dd6324ea1e 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -343,17 +343,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
struct bio *bio = wi->bio;
int err;
- if (segbuf->sb_nbio > 0 &&
- bdi_write_congested(segbuf->sb_super->s_bdi)) {
- wait_for_completion(&segbuf->sb_bio_event);
- segbuf->sb_nbio--;
- if (unlikely(atomic_read(&segbuf->sb_err))) {
- bio_put(bio);
- err = -EIO;
- goto failed;
- }
- }
-
bio->bi_end_io = nilfs_end_bio_write;
bio->bi_private = segbuf;
bio_set_op_attrs(bio, mode, mode_flags);
@@ -366,7 +355,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
wi->start = wi->end;
return 0;
- failed:
wi->bio = NULL;
return err;
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4474adb393ca8..517b71c73aa96 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi)
}
/* Now allocate memory for the attribute list. */
ni->attr_list_size = (u32)ntfs_attr_size(a);
+ if (!ni->attr_list_size) {
+ ntfs_error(sb, "Attr_list_size is zero");
+ goto put_err_out;
+ }
ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
if (!ni->attr_list) {
ntfs_error(sb, "Not enough memory to allocate buffer "
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index fc5f780fa2355..24321c44cd42e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -540,15 +540,12 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret)
{
- int ret;
struct ocfs2_extent_tree et;
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
- ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
- clusters_to_add, mark_unwritten,
- data_ac, meta_ac, reason_ret);
-
- return ret;
+ return ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
+ clusters_to_add, mark_unwritten,
+ data_ac, meta_ac, reason_ret);
}
static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2c46ff6ba4ea2..0939186f010e3 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -453,8 +453,12 @@ roll_back:
leave:
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && new_fe_bh != NULL)
+ ocfs2_set_links_count((struct ocfs2_dinode *)
+ new_fe_bh->b_data, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
leave:
if (status < 0) {
if (*new_fe_bh) {
+ if (fe)
+ ocfs2_set_links_count(fe, 0);
brelse(*new_fe_bh);
*new_fe_bh = NULL;
}
@@ -634,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
- if (status < 0) {
+ if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags &
+ OCFS2_LOCK_INITIALIZED)) {
u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
@@ -2027,8 +2034,12 @@ bail:
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && new_fe_bh != NULL)
+ ocfs2_set_links_count((struct ocfs2_dinode *)
+ new_fe_bh->b_data, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -2489,6 +2500,7 @@ out:
}
int ocfs2_create_inode_in_orphan(struct inode *dir,
+ struct buffer_head **dir_bh,
int mode,
struct inode **new_inode)
{
@@ -2597,13 +2609,16 @@ leave:
brelse(new_di_bh);
- if (!status)
- *new_inode = inode;
-
ocfs2_free_dir_lookup_result(&orphan_insert);
- ocfs2_inode_unlock(dir, 1);
- brelse(parent_di_bh);
+ if (!status) {
+ *new_inode = inode;
+ *dir_bh = parent_di_bh;
+ } else {
+ ocfs2_inode_unlock(dir, 1);
+ brelse(parent_di_bh);
+ }
+
return status;
}
@@ -2760,11 +2775,11 @@ bail:
}
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
struct dentry *dentry)
{
int status = 0;
- struct buffer_head *parent_di_bh = NULL;
handle_t *handle = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dinode *dir_di, *di;
@@ -2778,14 +2793,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
- if (status < 0) {
- if (status != -ENOENT)
- mlog_errno(status);
- return status;
- }
-
- dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+ dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
if (!dir_di->i_links_count) {
/* can't make a file in a deleted directory. */
status = -ENOENT;
@@ -2798,7 +2806,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
goto leave;
/* get a spot inside the dir. */
- status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+ status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
dentry->d_name.name,
dentry->d_name.len, &lookup);
if (status < 0) {
@@ -2862,7 +2870,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
ocfs2_journal_dirty(handle, di_bh);
status = ocfs2_add_entry(handle, dentry, inode,
- OCFS2_I(inode)->ip_blkno, parent_di_bh,
+ OCFS2_I(inode)->ip_blkno, dir_bh,
&lookup);
if (status < 0) {
mlog_errno(status);
@@ -2886,10 +2894,7 @@ orphan_unlock:
iput(orphan_dir_inode);
leave:
- ocfs2_inode_unlock(dir, 1);
-
brelse(di_bh);
- brelse(parent_di_bh);
brelse(orphan_dir_bh);
ocfs2_free_dir_lookup_result(&lookup);
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 9cc891eb874e0..03a2c526e2c1b 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
struct buffer_head *orphan_dir_bh,
bool dio);
int ocfs2_create_inode_in_orphan(struct inode *dir,
+ struct buffer_head **dir_bh,
int mode,
struct inode **new_inode);
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
@@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
struct inode *inode, struct buffer_head *di_bh,
int update_isize, loff_t end);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *new_inode,
struct dentry *new_dentry);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7f6355cbb5875..a9a0c7c37e8ed 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
{
int error, had_lock;
struct inode *inode = d_inode(old_dentry);
- struct buffer_head *old_bh = NULL;
+ struct buffer_head *old_bh = NULL, *dir_bh = NULL;
struct inode *new_orphan_inode = NULL;
struct ocfs2_lock_holder oh;
@@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
return -EOPNOTSUPP;
- error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+ error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
@@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
- error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+ error = ocfs2_init_security_and_acl(dir, dir_bh,
+ new_orphan_inode,
&new_dentry->d_name);
if (error)
mlog_errno(error);
}
if (!error) {
- error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+ error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
+ new_orphan_inode,
new_dentry);
if (error)
mlog_errno(error);
@@ -4328,6 +4330,11 @@ out:
iput(new_orphan_inode);
}
+ if (dir_bh) {
+ ocfs2_inode_unlock(dir, 1);
+ brelse(dir_bh);
+ }
+
return error;
}
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 85a47621e0c07..a75e2b7d67f56 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -683,28 +683,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
void *name,
unsigned int namelen)
{
- int ret;
-
if (!lksb->lksb_fsdlm.sb_lvbptr)
lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
sizeof(struct dlm_lksb);
- ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
- flags|DLM_LKF_NODLCKWT, name, namelen, 0,
- fsdlm_lock_ast_wrapper, lksb,
- fsdlm_blocking_ast_wrapper);
- return ret;
+ return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+ flags|DLM_LKF_NODLCKWT, name, namelen, 0,
+ fsdlm_lock_ast_wrapper, lksb,
+ fsdlm_blocking_ast_wrapper);
}
static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags)
{
- int ret;
-
- ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
- flags, &lksb->lksb_fsdlm, lksb);
- return ret;
+ return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+ flags, &lksb->lksb_fsdlm, lksb);
}
static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dd784eb0cd7c4..3f23e3a5018ce 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7203,16 +7203,13 @@ out:
/*
* Initialize security and acl for a already created inode.
* Used for reflink a non-preserve-security file.
- *
- * It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_mutex.
*/
int ocfs2_init_security_and_acl(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
const struct qstr *qstr)
{
int ret = 0;
- struct buffer_head *dir_bh = NULL;
ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (ret) {
@@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
goto leave;
}
- ret = ocfs2_inode_lock(dir, &dir_bh, 0);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
if (ret)
mlog_errno(ret);
- ocfs2_inode_unlock(dir, 0);
- brelse(dir_bh);
leave:
return ret;
}
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 00308b57f64f1..b27fd8ba00196 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
struct buffer_head *new_bh,
bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
const struct qstr *qstr);
#endif /* OCFS2_XATTR_H */
diff --git a/fs/pipe.c b/fs/pipe.c
index cc28623a67b61..71946832e33f9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -606,7 +606,7 @@ out:
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe = filp->private_data;
- int count, head, tail, mask;
+ unsigned int count, head, tail, mask;
switch (cmd) {
case FIONREAD:
@@ -803,7 +803,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;
- pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
+ pipe->bufs = kvcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);
if (pipe->bufs) {
@@ -828,7 +828,7 @@ out_free_uid:
void free_pipe_info(struct pipe_inode_info *pipe)
{
- int i;
+ unsigned int i;
#ifdef CONFIG_WATCH_QUEUE
if (pipe->watch_queue) {
@@ -846,7 +846,7 @@ void free_pipe_info(struct pipe_inode_info *pipe)
}
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
- kfree(pipe->bufs);
+ kvfree(pipe->bufs);
kfree(pipe);
}
@@ -1261,8 +1261,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
if (nr_slots < n)
return -EBUSY;
- bufs = kcalloc(nr_slots, sizeof(*bufs),
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ bufs = kvcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT);
if (unlikely(!bufs))
return -ENOMEM;
@@ -1289,7 +1288,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
head = n;
tail = 0;
- kfree(pipe->bufs);
+ kvfree(pipe->bufs);
pipe->bufs = bufs;
pipe->ring_size = nr_slots;
if (pipe->max_usage > nr_slots)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d654ce7150fdd..76bf1aa3cfe88 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1764,25 +1764,25 @@ out:
static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
{
- char *tmp = (char *)__get_free_page(GFP_KERNEL);
+ char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
char *pathname;
int len;
if (!tmp)
return -ENOMEM;
- pathname = d_path(path, tmp, PAGE_SIZE);
+ pathname = d_path(path, tmp, PATH_MAX);
len = PTR_ERR(pathname);
if (IS_ERR(pathname))
goto out;
- len = tmp + PAGE_SIZE - 1 - pathname;
+ len = tmp + PATH_MAX - 1 - pathname;
if (len > buflen)
len = buflen;
if (copy_to_user(buffer, pathname, len))
len = -EFAULT;
out:
- free_page((unsigned long)tmp);
+ kfree(tmp);
return len;
}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 172c86270b312..913bef0d2a36c 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -72,7 +72,7 @@ out:
return 0;
}
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
+static int proc_fdinfo_access_allowed(struct inode *inode)
{
bool allowed = false;
struct task_struct *task = get_proc_task(inode);
@@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
if (!allowed)
return -EACCES;
+ return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
return single_open(file, seq_show, inode);
}
@@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
proc_fdinfo_instantiate);
}
+static int proc_open_fdinfo(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
+ .open = proc_open_fdinfo,
.read = generic_read_dir,
.iterate_shared = proc_readfdinfo,
.llseek = generic_file_llseek,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 9f1077d94cde1..4dcbcd506cb6e 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,7 +115,10 @@ u64 stable_page_flags(struct page *page)
* it differentiates a memory hole from a page with no flags
*/
if (!page)
- return 1 << KPF_NOPAGE;
+ return BIT_ULL(KPF_NOPAGE);
+
+ if (pfn_zone_device_reserved(page_to_pfn(page)))
+ return BIT_ULL(KPF_RESERVED);
k = page->flags;
u = 0;
@@ -127,22 +130,22 @@ u64 stable_page_flags(struct page *page)
* simple test in page_mapped() is not enough.
*/
if (!PageSlab(page) && page_mapped(page))
- u |= 1 << KPF_MMAP;
+ u |= BIT_ULL(KPF_MMAP);
if (PageAnon(page))
- u |= 1 << KPF_ANON;
+ u |= BIT_ULL(KPF_ANON);
if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ u |= BIT_ULL(KPF_KSM);
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
+ u |= BIT_ULL(KPF_COMPOUND_HEAD);
if (PageTail(page))
- u |= 1 << KPF_COMPOUND_TAIL;
+ u |= BIT_ULL(KPF_COMPOUND_TAIL);
if (PageHuge(page))
- u |= 1 << KPF_HUGE;
+ u |= BIT_ULL(KPF_HUGE);
/*
* PageTransCompound can be true for non-huge compound pages (slab
* pages or pages allocated by drivers with __GFP_COMP) because it
@@ -153,14 +156,13 @@ u64 stable_page_flags(struct page *page)
struct page *head = compound_head(page);
if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_THP);
else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_ZERO_PAGE);
+ u |= BIT_ULL(KPF_THP);
}
} else if (is_zero_pfn(page_to_pfn(page)))
- u |= 1 << KPF_ZERO_PAGE;
-
+ u |= BIT_ULL(KPF_ZERO_PAGE);
/*
* Caveats on high order pages: page->_refcount will only be set
@@ -168,23 +170,23 @@ u64 stable_page_flags(struct page *page)
* SLOB won't set PG_slab at all on compound pages.
*/
if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
else if (page_count(page) == 0 && is_free_buddy_page(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
if (PageOffline(page))
- u |= 1 << KPF_OFFLINE;
+ u |= BIT_ULL(KPF_OFFLINE);
if (PageTable(page))
- u |= 1 << KPF_PGTABLE;
+ u |= BIT_ULL(KPF_PGTABLE);
if (page_is_idle(page))
- u |= 1 << KPF_IDLE;
+ u |= BIT_ULL(KPF_IDLE);
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
if (PageTail(page) && PageSlab(compound_head(page)))
- u |= 1 << KPF_SLAB;
+ u |= BIT_ULL(KPF_SLAB);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
@@ -197,7 +199,7 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
if (PageSwapCache(page))
- u |= 1 << KPF_SWAPCACHE;
+ u |= BIT_ULL(KPF_SWAPCACHE);
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 18f8c3acbb85e..6e97ed775074c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
}
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty, bool locked)
+ bool compound, bool young, bool dirty, bool locked,
+ bool migration)
{
int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
@@ -467,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
* page_count(page) == 1 guarantees the page is mapped exactly once.
* If any subpage of the compound page mapped with PTE it would elevate
* page_count().
+ *
+ * The page_mapcount() is called to get a snapshot of the mapcount.
+ * Without holding the page lock this snapshot can be slightly wrong as
+ * we cannot always read the mapcount atomically. It is not safe to
+ * call page_mapcount() even with PTL held if the page is not mapped,
+ * especially for migration entries. Treat regular migration entries
+ * as mapcount == 1.
*/
- if (page_count(page) == 1) {
+ if ((page_count(page) == 1) || migration) {
smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
locked, true);
return;
@@ -517,6 +525,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool migration = false;
if (pte_present(*pte)) {
page = vm_normal_page(vma, addr, *pte);
@@ -536,8 +545,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_pfn_swap_entry(swpent))
+ } else if (is_pfn_swap_entry(swpent)) {
+ if (is_migration_entry(swpent))
+ migration = true;
page = pfn_swap_entry_to_page(swpent);
+ }
} else {
smaps_pte_hole_lookup(addr, walk);
return;
@@ -546,7 +558,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
+ locked, migration);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -557,6 +570,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool migration = false;
if (pmd_present(*pmd)) {
/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -564,8 +578,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- if (is_migration_entry(entry))
+ if (is_migration_entry(entry)) {
+ migration = true;
page = pfn_swap_entry_to_page(entry);
+ }
}
if (IS_ERR_OR_NULL(page))
return;
@@ -577,7 +593,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
/* pass */;
else
mss->file_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+ locked, migration);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -1378,6 +1396,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
{
u64 frame = 0, flags = 0;
struct page *page = NULL;
+ bool migration = false;
if (pte_present(pte)) {
if (pm->show_pfn)
@@ -1399,13 +1418,14 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
+ migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
@@ -1421,8 +1441,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
spinlock_t *ptl;
pte_t *pte, *orig_pte;
int err = 0;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool migration = false;
+
ptl = pmd_trans_huge_lock(pmdp, vma);
if (ptl) {
u64 flags = 0, frame = 0;
@@ -1461,11 +1482,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (pmd_swp_uffd_wp(pmd))
flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ migration = is_migration_entry(entry);
page = pfn_swap_entry_to_page(entry);
}
#endif
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
for (; addr != end; addr += PAGE_SIZE) {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 702754dd1daff..6f1b8ddc6f7a4 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -62,7 +62,8 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
/* Device Dump Size */
static size_t vmcoredd_orig_sz;
-static DECLARE_RWSEM(vmcore_cb_rwsem);
+static DEFINE_SPINLOCK(vmcore_cb_lock);
+DEFINE_STATIC_SRCU(vmcore_cb_srcu);
/* List of registered vmcore callbacks. */
static LIST_HEAD(vmcore_cb_list);
/* Whether the vmcore has been opened once. */
@@ -70,8 +71,8 @@ static bool vmcore_opened;
void register_vmcore_cb(struct vmcore_cb *cb)
{
- down_write(&vmcore_cb_rwsem);
INIT_LIST_HEAD(&cb->next);
+ spin_lock(&vmcore_cb_lock);
list_add_tail(&cb->next, &vmcore_cb_list);
/*
* Registering a vmcore callback after the vmcore was opened is
@@ -79,14 +80,14 @@ void register_vmcore_cb(struct vmcore_cb *cb)
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback registration\n");
- up_write(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
}
EXPORT_SYMBOL_GPL(register_vmcore_cb);
void unregister_vmcore_cb(struct vmcore_cb *cb)
{
- down_write(&vmcore_cb_rwsem);
- list_del(&cb->next);
+ spin_lock(&vmcore_cb_lock);
+ list_del_rcu(&cb->next);
/*
* Unregistering a vmcore callback after the vmcore was opened is
* very unusual (e.g., forced driver removal), but we cannot stop
@@ -94,7 +95,9 @@ void unregister_vmcore_cb(struct vmcore_cb *cb)
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback unregistration\n");
- up_write(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
+
+ synchronize_srcu(&vmcore_cb_srcu);
}
EXPORT_SYMBOL_GPL(unregister_vmcore_cb);
@@ -103,9 +106,8 @@ static bool pfn_is_ram(unsigned long pfn)
struct vmcore_cb *cb;
bool ret = true;
- lockdep_assert_held_read(&vmcore_cb_rwsem);
-
- list_for_each_entry(cb, &vmcore_cb_list, next) {
+ list_for_each_entry_srcu(cb, &vmcore_cb_list, next,
+ srcu_read_lock_held(&vmcore_cb_srcu)) {
if (unlikely(!cb->pfn_is_ram))
continue;
ret = cb->pfn_is_ram(cb, pfn);
@@ -118,9 +120,9 @@ static bool pfn_is_ram(unsigned long pfn)
static int open_vmcore(struct inode *inode, struct file *file)
{
- down_read(&vmcore_cb_rwsem);
+ spin_lock(&vmcore_cb_lock);
vmcore_opened = true;
- up_read(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
return 0;
}
@@ -133,6 +135,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
unsigned long pfn, offset;
size_t nr_bytes;
ssize_t read = 0, tmp;
+ int idx;
if (!count)
return 0;
@@ -140,7 +143,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);
- down_read(&vmcore_cb_rwsem);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
do {
if (count > (PAGE_SIZE - offset))
nr_bytes = PAGE_SIZE - offset;
@@ -165,7 +168,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset, userbuf);
}
if (tmp < 0) {
- up_read(&vmcore_cb_rwsem);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
return tmp;
}
@@ -176,8 +179,8 @@ ssize_t read_from_oldmem(char *buf, size_t count,
++pfn;
offset = 0;
} while (count);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
- up_read(&vmcore_cb_rwsem);
return read;
}
@@ -477,7 +480,7 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
/**
* vmcore_alloc_buf - allocate buffer in vmalloc memory
- * @sizez: size of buffer
+ * @size: size of buffer
*
* If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
* the buffer to user-space by means of remap_vmalloc_range().
@@ -568,18 +571,18 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
- int ret;
+ int ret, idx;
/*
- * Check if oldmem_pfn_is_ram was registered to avoid
- * looping over all pages without a reason.
+ * Check if a callback was registered to avoid looping over all
+ * pages without a reason.
*/
- down_read(&vmcore_cb_rwsem);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
if (!list_empty(&vmcore_cb_list))
ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
else
ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
- up_read(&vmcore_cb_rwsem);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
return ret;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ae87fd95b17e2..44795018df1d8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -843,9 +843,6 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
- if (bdi_read_congested(target->bt_bdev->bd_disk->bdi))
- return;
-
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
__this_address);
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 993c5628a7263..e863c88df95f9 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -207,14 +207,6 @@ struct backing_dev_info {
#endif
};
-enum {
- BLK_RW_ASYNC = 0,
- BLK_RW_SYNC = 1,
-};
-
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
-void set_bdi_congested(struct backing_dev_info *bdi, int sync);
-
struct wb_lock_cookie {
bool locked;
unsigned long flags;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 483979c1b9f43..87ce24d238f34 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -135,13 +135,6 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb)
struct backing_dev_info *inode_to_bdi(struct inode *inode);
-static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
-{
- return wb->congested & cong_bits;
-}
-
-long congestion_wait(int sync, long timeout);
-
static inline bool mapping_can_writeback(struct address_space *mapping)
{
return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK;
@@ -162,7 +155,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
gfp_t gfp);
void wb_memcg_offline(struct mem_cgroup *memcg);
void wb_blkcg_offline(struct blkcg *blkcg);
-int inode_congested(struct inode *inode, int cong_bits);
/**
* inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
@@ -390,50 +382,8 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
{
}
-static inline int inode_congested(struct inode *inode, int cong_bits)
-{
- return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
-}
-
#endif /* CONFIG_CGROUP_WRITEBACK */
-static inline int inode_read_congested(struct inode *inode)
-{
- return inode_congested(inode, 1 << WB_sync_congested);
-}
-
-static inline int inode_write_congested(struct inode *inode)
-{
- return inode_congested(inode, 1 << WB_async_congested);
-}
-
-static inline int inode_rw_congested(struct inode *inode)
-{
- return inode_congested(inode, (1 << WB_sync_congested) |
- (1 << WB_async_congested));
-}
-
-static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
-{
- return wb_congested(&bdi->wb, cong_bits);
-}
-
-static inline int bdi_read_congested(struct backing_dev_info *bdi)
-{
- return bdi_congested(bdi, 1 << WB_sync_congested);
-}
-
-static inline int bdi_write_congested(struct backing_dev_info *bdi)
-{
- return bdi_congested(bdi, 1 << WB_async_congested);
-}
-
-static inline int bdi_rw_congested(struct backing_dev_info *bdi)
-{
- return bdi_congested(bdi, (1 << WB_sync_congested) |
- (1 << WB_async_congested));
-}
-
const char *bdi_dev_name(struct backing_dev_info *bdi);
#endif /* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 338aa27e4773b..edb7f6d41faa0 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -80,12 +80,6 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
#ifdef CONFIG_BALLOON_COMPACTION
extern const struct address_space_operations balloon_aops;
-extern bool balloon_page_isolate(struct page *page,
- isolate_mode_t mode);
-extern void balloon_page_putback(struct page *page);
-extern int balloon_page_migrate(struct address_space *mapping,
- struct page *newpage,
- struct page *page, enum migrate_mode mode);
/*
* balloon_page_insert - insert a page into the balloon's page list and make
@@ -155,22 +149,6 @@ static inline void balloon_page_delete(struct page *page)
list_del(&page->lru);
}
-static inline bool balloon_page_isolate(struct page *page)
-{
- return false;
-}
-
-static inline void balloon_page_putback(struct page *page)
-{
- return;
-}
-
-static inline int balloon_page_migrate(struct page *newpage,
- struct page *page, enum migrate_mode mode)
-{
- return 0;
-}
-
static inline gfp_t balloon_mapping_gfp_mask(void)
{
return GFP_HIGHUSER;
diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 6093fa6db2600..c9be1657f03d9 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -19,6 +19,9 @@
*
* Example:
*
+ * #include <linux/bitfield.h>
+ * #include <linux/bits.h>
+ *
* #define REG_FIELD_A GENMASK(6, 0)
* #define REG_FIELD_B BIT(7)
* #define REG_FIELD_C GENMASK(15, 8)
diff --git a/include/linux/cma.h b/include/linux/cma.h
index bd801023504b2..51d540eee18ac 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -50,4 +50,6 @@ extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned
extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count);
extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
+
+extern void cma_reserve_pages_on_error(struct cma *cma);
#endif
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 3f31ff400432b..9102ea2e4f8fe 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -140,8 +140,6 @@ struct ftrace_likely_data {
*/
#define __naked __attribute__((__naked__)) notrace
-#define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
-
/*
* Prefer gnu_inline, so that extern inline functions do not emit an
* externally visible function. This makes extern inline behave as per gnu89
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 5e1e3a128b77a..7c1d915b35875 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -60,19 +60,18 @@ struct damon_region {
/**
* struct damon_target - Represents a monitoring target.
- * @id: Unique identifier for this target.
+ * @pid: The PID of the virtual address space to monitor.
* @nr_regions: Number of monitoring target regions of this target.
* @regions_list: Head of the monitoring target regions of this target.
* @list: List head for siblings.
*
* Each monitoring context could have multiple targets. For example, a context
* for virtual memory address spaces could have multiple target processes. The
- * @id of each target should be unique among the targets of the context. For
- * example, in the virtual address monitoring context, it could be a pidfd or
- * an address of an mm_struct.
+ * @pid should be set for appropriate address space monitoring primitives
+ * including the virtual address spaces monitoring primitives.
*/
struct damon_target {
- unsigned long id;
+ struct pid *pid;
unsigned int nr_regions;
struct list_head regions_list;
struct list_head list;
@@ -475,7 +474,7 @@ struct damos *damon_new_scheme(
void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
void damon_destroy_scheme(struct damos *s);
-struct damon_target *damon_new_target(unsigned long id);
+struct damon_target *damon_new_target(void);
void damon_add_target(struct damon_ctx *ctx, struct damon_target *t);
bool damon_targets_empty(struct damon_ctx *ctx);
void damon_free_target(struct damon_target *t);
@@ -484,8 +483,6 @@ unsigned int damon_nr_regions(struct damon_target *t);
struct damon_ctx *damon_new_ctx(void);
void damon_destroy_ctx(struct damon_ctx *ctx);
-int damon_set_targets(struct damon_ctx *ctx,
- unsigned long *ids, ssize_t nr_ids);
int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
unsigned long aggr_int, unsigned long primitive_upd_int,
unsigned long min_nr_reg, unsigned long max_nr_reg);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ce5aabb609cd2..7c128f303b3d4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1435,6 +1435,7 @@ extern int send_sigurg(struct fown_struct *fown);
#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */
#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */
+#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
/* Possible states of 'frozen' field */
enum {
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 80f63c862be57..6eef3e4475401 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -39,7 +39,7 @@ struct vm_area_struct;
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
#define ___GFP_ZERO 0x100u
-#define ___GFP_ATOMIC 0x200u
+/* 0x200u unused */
#define ___GFP_DIRECT_RECLAIM 0x400u
#define ___GFP_KSWAPD_RECLAIM 0x800u
#define ___GFP_WRITE 0x1000u
@@ -54,9 +54,17 @@ struct vm_area_struct;
#define ___GFP_THISNODE 0x200000u
#define ___GFP_ACCOUNT 0x400000u
#define ___GFP_ZEROTAGS 0x800000u
-#define ___GFP_SKIP_KASAN_POISON 0x1000000u
+#ifdef CONFIG_KASAN_HW_TAGS
+#define ___GFP_SKIP_ZERO 0x1000000u
+#define ___GFP_SKIP_KASAN_UNPOISON 0x2000000u
+#define ___GFP_SKIP_KASAN_POISON 0x4000000u
+#else
+#define ___GFP_SKIP_ZERO 0
+#define ___GFP_SKIP_KASAN_UNPOISON 0
+#define ___GFP_SKIP_KASAN_POISON 0
+#endif
#ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP 0x2000000u
+#define ___GFP_NOLOCKDEP 0x8000000u
#else
#define ___GFP_NOLOCKDEP 0
#endif
@@ -116,11 +124,8 @@ struct vm_area_struct;
*
* %__GFP_HIGH indicates that the caller is high-priority and that granting
* the request is necessary before the system can make forward progress.
- * For example, creating an IO context to clean pages.
- *
- * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
- * high priority. Users are typically interrupt handlers. This may be
- * used in conjunction with %__GFP_HIGH
+ * For example creating an IO context to clean pages and requests
+ * from atomic context.
*
* %__GFP_MEMALLOC allows access to all memory. This should only be used when
* the caller guarantees the allocation will allow more memory to be freed
@@ -135,7 +140,6 @@ struct vm_area_struct;
* %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
* This takes precedence over the %__GFP_MEMALLOC flag if both are set.
*/
-#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC)
#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH)
#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
@@ -232,24 +236,33 @@ struct vm_area_struct;
*
* %__GFP_ZERO returns a zeroed page on success.
*
- * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if
- * __GFP_ZERO is set.
+ * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself
+ * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that
+ * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting
+ * memory tags at the same time as zeroing memory has minimal additional
+ * performace impact.
+ *
+ * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation.
+ * Only effective in HW_TAGS mode.
*
- * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned
- * on deallocation. Typically used for userspace pages. Currently only has an
- * effect in HW tags mode.
+ * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation.
+ * Typically, used for userspace pages. Only effective in HW_TAGS mode.
*/
#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN)
#define __GFP_COMP ((__force gfp_t)___GFP_COMP)
#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO)
#define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS)
-#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
+#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
+#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
+#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
/* Disable lockdep for GFP context tracking */
#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
/* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (24 + \
+ 3 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \
+ IS_ENABLED(CONFIG_LOCKDEP))
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/**
@@ -322,7 +335,7 @@ struct vm_area_struct;
* version does not attempt reclaim/compaction at all and is by default used
* in page fault path, while the non-light is used by khugepaged.
*/
-#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 0a0b2b09b1b8d..a77be56302094 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -246,6 +246,16 @@ do { \
__kunmap_atomic(__addr); \
} while (0)
+/**
+ * kunmap_local - Unmap a page mapped via kmap_local_page().
+ * @__addr: An address within the page mapped
+ *
+ * @__addr can be any address within the mapped page. Commonly it is the
+ * address return from kmap_local_page(), but it can also include offsets.
+ *
+ * Unmapping should be done in the reverse order of the mapping. See
+ * kmap_local_page() for details.
+ */
#define kunmap_local(__addr) \
do { \
BUILD_BUG_ON(__same_type((__addr), struct page *)); \
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d1897a69c5406..08357b4c7be73 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -754,7 +754,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { }
static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
vm_flags_t flags)
{
- return entry;
+ return pte_mkhuge(entry);
}
#endif
@@ -1075,12 +1075,6 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
}
#endif /* CONFIG_HUGETLB_PAGE */
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
-extern bool hugetlb_free_vmemmap_enabled;
-#else
-#define hugetlb_free_vmemmap_enabled false
-#endif
-
static inline spinlock_t *huge_pte_lock(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 5fb17dd4b6fad..ebfeb6099c283 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -163,6 +163,33 @@ static inline bool kallsyms_show_value(const struct cred *cred)
return false;
}
+#ifdef CONFIG_MODULES
+static inline int fill_minimal_module_info(char *sym, int size, unsigned long value)
+{
+ struct module *mod;
+ unsigned long offset;
+ int ret = 0;
+
+ preempt_disable();
+ mod = __module_address(value);
+ if (mod) {
+ offset = value - (unsigned long)mod->core_layout.base;
+ snprintf(sym, size - 1, "0x%lx+0x%lx [%s]",
+ (unsigned long)mod->core_layout.base, offset, mod->name);
+
+ sym[size - 1] = '\0';
+ ret = 1;
+ }
+
+ preempt_enable();
+ return ret;
+}
+#else
+static inline int fill_minimal_module_info(char *sym, int size, unsigned long value)
+{
+ return 0;
+}
+#endif /*CONFIG_MODULES*/
#endif /*CONFIG_KALLSYMS*/
static inline void print_ip_sym(const char *loglvl, unsigned long ip)
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 4a45562d88937..3593c95d1fa54 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -25,6 +25,13 @@ struct kunit_kasan_expectation {
#endif
+typedef unsigned int __bitwise kasan_vmalloc_flags_t;
+
+#define KASAN_VMALLOC_NONE 0x00u
+#define KASAN_VMALLOC_INIT 0x01u
+#define KASAN_VMALLOC_VM_ALLOC 0x02u
+#define KASAN_VMALLOC_PROT_NORMAL 0x04u
+
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#include <linux/pgtable.h>
@@ -95,9 +102,6 @@ static inline bool kasan_hw_tags_enabled(void)
return kasan_enabled();
}
-void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags);
-void kasan_free_pages(struct page *page, unsigned int order);
-
#else /* CONFIG_KASAN_HW_TAGS */
static inline bool kasan_enabled(void)
@@ -110,20 +114,6 @@ static inline bool kasan_hw_tags_enabled(void)
return false;
}
-static __always_inline void kasan_alloc_pages(struct page *page,
- unsigned int order, gfp_t flags)
-{
- /* Only available for integrated init. */
- BUILD_BUG();
-}
-
-static __always_inline void kasan_free_pages(struct page *page,
- unsigned int order)
-{
- /* Only available for integrated init. */
- BUILD_BUG();
-}
-
#endif /* CONFIG_KASAN_HW_TAGS */
static inline bool kasan_has_integrated_init(void)
@@ -435,34 +425,71 @@ static inline void kasan_init_hw_tags(void) { }
#ifdef CONFIG_KASAN_VMALLOC
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+
+void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
-void kasan_poison_vmalloc(const void *start, unsigned long size);
-void kasan_unpoison_vmalloc(const void *start, unsigned long size);
void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long free_region_start,
unsigned long free_region_end);
-void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
+#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
+
+static inline void kasan_populate_early_vm_area_shadow(void *start,
+ unsigned long size)
+{ }
+static inline int kasan_populate_vmalloc(unsigned long start,
+ unsigned long size)
+{
+ return 0;
+}
+static inline void kasan_release_vmalloc(unsigned long start,
+ unsigned long end,
+ unsigned long free_region_start,
+ unsigned long free_region_end) { }
+
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
+
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+ kasan_vmalloc_flags_t flags);
+static __always_inline void *kasan_unpoison_vmalloc(const void *start,
+ unsigned long size,
+ kasan_vmalloc_flags_t flags)
+{
+ if (kasan_enabled())
+ return __kasan_unpoison_vmalloc(start, size, flags);
+ return (void *)start;
+}
+
+void __kasan_poison_vmalloc(const void *start, unsigned long size);
+static __always_inline void kasan_poison_vmalloc(const void *start,
+ unsigned long size)
+{
+ if (kasan_enabled())
+ __kasan_poison_vmalloc(start, size);
+}
#else /* CONFIG_KASAN_VMALLOC */
+static inline void kasan_populate_early_vm_area_shadow(void *start,
+ unsigned long size) { }
static inline int kasan_populate_vmalloc(unsigned long start,
unsigned long size)
{
return 0;
}
-
-static inline void kasan_poison_vmalloc(const void *start, unsigned long size)
-{ }
-static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size)
-{ }
static inline void kasan_release_vmalloc(unsigned long start,
unsigned long end,
unsigned long free_region_start,
- unsigned long free_region_end) {}
+ unsigned long free_region_end) { }
-static inline void kasan_populate_early_vm_area_shadow(void *start,
- unsigned long size)
+static inline void *kasan_unpoison_vmalloc(const void *start,
+ unsigned long size,
+ kasan_vmalloc_flags_t flags)
+{
+ return (void *)start;
+}
+static inline void kasan_poison_vmalloc(const void *start, unsigned long size)
{ }
#endif /* CONFIG_KASAN_VMALLOC */
@@ -471,17 +498,17 @@ static inline void kasan_populate_early_vm_area_shadow(void *start,
!defined(CONFIG_KASAN_VMALLOC)
/*
- * These functions provide a special case to support backing module
- * allocations with real shadow memory. With KASAN vmalloc, the special
- * case is unnecessary, as the work is handled in the generic case.
+ * These functions allocate and free shadow memory for kernel modules.
+ * They are only required when KASAN_VMALLOC is not supported, as otherwise
+ * shadow memory is allocated by the generic vmalloc handlers.
*/
-int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask);
-void kasan_free_shadow(const struct vm_struct *vm);
+int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask);
+void kasan_free_module_shadow(const struct vm_struct *vm);
#else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
-static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; }
-static inline void kasan_free_shadow(const struct vm_struct *vm) {}
+static inline int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { return 0; }
+static inline void kasan_free_module_shadow(const struct vm_struct *vm) {}
#endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 0c994ae37729e..58d1b58a971e3 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -20,6 +20,12 @@
#include <uapi/linux/kexec.h>
+/* Location of a reserved region to hold the crash kernel.
+ */
+extern struct resource crashk_res;
+extern struct resource crashk_low_res;
+extern note_buf_t __percpu *crash_notes;
+
#ifdef CONFIG_KEXEC_CORE
#include <linux/list.h>
#include <linux/compat.h>
@@ -350,12 +356,6 @@ extern int kexec_load_disabled;
#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
KEXEC_FILE_NO_INITRAMFS)
-/* Location of a reserved region to hold the crash kernel.
- */
-extern struct resource crashk_res;
-extern struct resource crashk_low_res;
-extern note_buf_t __percpu *crash_notes;
-
/* flag to track if kexec reboot is in progress */
extern bool kexec_in_progress;
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 4b5e3679a72c7..f49e64222628a 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -17,6 +17,8 @@
#include <linux/atomic.h>
#include <linux/static_key.h>
+extern unsigned long kfence_sample_interval;
+
/*
* We allocate an even number of pages, as it simplifies calculations to map
* address to metadata indices; effectively, the very first page serves as an
diff --git a/include/linux/log2.h b/include/linux/log2.h
index df0b155c21417..9f30d087a1281 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -18,7 +18,7 @@
* - the arch is not required to handle n==0 if implementing the fallback
*/
#ifndef CONFIG_ARCH_HAS_ILOG2_U32
-static inline __attribute__((const))
+static __always_inline __attribute__((const))
int __ilog2_u32(u32 n)
{
return fls(n) - 1;
@@ -26,7 +26,7 @@ int __ilog2_u32(u32 n)
#endif
#ifndef CONFIG_ARCH_HAS_ILOG2_U64
-static inline __attribute__((const))
+static __always_inline __attribute__((const))
int __ilog2_u64(u64 n)
{
return fls64(n) - 1;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b72d75141e125..ef4b445392a9e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -34,6 +34,7 @@ enum memcg_stat_item {
MEMCG_SOCK,
MEMCG_PERCPU_B,
MEMCG_VMALLOC,
+ MEMCG_KMEM,
MEMCG_NR_STAT,
};
@@ -219,7 +220,7 @@ struct obj_cgroup {
struct mem_cgroup *memcg;
atomic_t nr_charged_bytes;
union {
- struct list_head list;
+ struct list_head list; /* protected by objcg_lock */
struct rcu_head rcu;
};
};
@@ -315,7 +316,8 @@ struct mem_cgroup {
#ifdef CONFIG_MEMCG_KMEM
int kmemcg_id;
struct obj_cgroup __rcu *objcg;
- struct list_head objcg_list; /* list of inherited objcgs */
+ /* list of inherited objcgs, protected by objcg_lock */
+ struct list_head objcg_list;
#endif
MEMCG_PADDING(_pad2_);
@@ -840,9 +842,7 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
*/
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
- if (!memcg->memory.parent)
- return NULL;
- return mem_cgroup_from_counter(memcg->memory.parent, memory);
+ return mem_cgroup_from_css(memcg->css.parent);
}
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index be48e003a5183..76bf2de86defc 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -16,6 +16,62 @@ struct memory_group;
struct resource;
struct vmem_altmap;
+#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
+/*
+ * For supporting node-hotadd, we have to allocate a new pgdat.
+ *
+ * If an arch has generic style NODE_DATA(),
+ * node_data[nid] = kzalloc() works well. But it depends on the architecture.
+ *
+ * In general, generic_alloc_nodedata() is used.
+ *
+ */
+extern pg_data_t *arch_alloc_nodedata(int nid);
+extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
+
+#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
+
+#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid)
+
+#ifdef CONFIG_NUMA
+/*
+ * XXX: node aware allocation can't work well to get new node's memory at this time.
+ * Because, pgdat for the new node is not allocated/initialized yet itself.
+ * To use new node's memory, more consideration will be necessary.
+ */
+#define generic_alloc_nodedata(nid) \
+({ \
+ memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \
+})
+/*
+ * This definition is just for error path in node hotadd.
+ * For node hotremove, we have to replace this.
+ */
+#define generic_free_nodedata(pgdat) kfree(pgdat)
+
+extern pg_data_t *node_data[];
+static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
+{
+ node_data[nid] = pgdat;
+}
+
+#else /* !CONFIG_NUMA */
+
+/* never called */
+static inline pg_data_t *generic_alloc_nodedata(int nid)
+{
+ BUG();
+ return NULL;
+}
+static inline void generic_free_nodedata(pg_data_t *pgdat)
+{
+}
+static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
+{
+}
+#endif /* CONFIG_NUMA */
+#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
+
#ifdef CONFIG_MEMORY_HOTPLUG
struct page *pfn_to_online_page(unsigned long pfn);
@@ -154,66 +210,6 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params);
#endif /* ARCH_HAS_ADD_PAGES */
-#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
-/*
- * For supporting node-hotadd, we have to allocate a new pgdat.
- *
- * If an arch has generic style NODE_DATA(),
- * node_data[nid] = kzalloc() works well. But it depends on the architecture.
- *
- * In general, generic_alloc_nodedata() is used.
- * Now, arch_free_nodedata() is just defined for error path of node_hot_add.
- *
- */
-extern pg_data_t *arch_alloc_nodedata(int nid);
-extern void arch_free_nodedata(pg_data_t *pgdat);
-extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
-
-#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
-
-#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid)
-#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat)
-
-#ifdef CONFIG_NUMA
-/*
- * If ARCH_HAS_NODEDATA_EXTENSION=n, this func is used to allocate pgdat.
- * XXX: kmalloc_node() can't work well to get new node's memory at this time.
- * Because, pgdat for the new node is not allocated/initialized yet itself.
- * To use new node's memory, more consideration will be necessary.
- */
-#define generic_alloc_nodedata(nid) \
-({ \
- kzalloc(sizeof(pg_data_t), GFP_KERNEL); \
-})
-/*
- * This definition is just for error path in node hotadd.
- * For node hotremove, we have to replace this.
- */
-#define generic_free_nodedata(pgdat) kfree(pgdat)
-
-extern pg_data_t *node_data[];
-static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
-{
- node_data[nid] = pgdat;
-}
-
-#else /* !CONFIG_NUMA */
-
-/* never called */
-static inline pg_data_t *generic_alloc_nodedata(int nid)
-{
- BUG();
- return NULL;
-}
-static inline void generic_free_nodedata(pg_data_t *pgdat)
-{
-}
-static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
-{
-}
-#endif /* CONFIG_NUMA */
-#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
-
void get_online_mems(void);
void put_online_mems(void);
@@ -323,7 +319,7 @@ extern void set_zone_contiguous(struct zone *zone);
extern void clear_zone_contiguous(struct zone *zone);
#ifdef CONFIG_MEMORY_HOTPLUG
-extern void __ref free_area_init_core_hotplug(int nid);
+extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat);
extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
extern int add_memory_resource(int nid, struct resource *resource,
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 668389b4b53d7..44383ab8af554 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -6,6 +6,7 @@
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1
+#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/slab.h>
@@ -42,7 +43,7 @@ struct mm_struct;
* to 1, representing the caller of mpol_dup().
*/
struct mempolicy {
- atomic_t refcnt;
+ refcount_t refcnt;
unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
nodemask_t nodes; /* interleave/bind/perfer */
@@ -94,7 +95,7 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
static inline void mpol_get(struct mempolicy *pol)
{
if (pol)
- atomic_inc(&pol->refcnt);
+ refcount_inc(&pol->refcnt);
}
extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 1fafcc38acbad..b6032560ec2db 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -39,6 +39,13 @@ struct vmem_altmap {
* A more complete discussion of unaddressable memory may be found in
* include/linux/hmm.h and Documentation/vm/hmm.rst.
*
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is used on platforms that have an advanced system bus (like CAPI or CXL). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allowed to pin such memory so that it can always be evicted.
+ *
* MEMORY_DEVICE_FS_DAX:
* Host memory that has similar access semantics as System RAM i.e. DMA
* coherent and supports page pinning. In support of coordinating page
@@ -59,6 +66,7 @@ struct vmem_altmap {
enum memory_type {
/* 0 is reserved to catch uninitialized type fields */
MEMORY_DEVICE_PRIVATE = 1,
+ MEMORY_DEVICE_COHERENT,
MEMORY_DEVICE_FS_DAX,
MEMORY_DEVICE_GENERIC,
MEMORY_DEVICE_PCI_P2PDMA,
@@ -130,6 +138,7 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
}
#ifdef CONFIG_ZONE_DEVICE
+bool pfn_zone_device_reserved(unsigned long pfn);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -142,6 +151,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
unsigned long memremap_compat_align(void);
#else
+static inline bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ return false;
+}
+
static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap)
{
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index db96e10eb8da2..66a34eae8cb63 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -130,6 +130,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
enum migrate_vma_direction {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
+ MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
};
struct migrate_vma {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 213cc569b1922..bf60d947503ef 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -820,16 +820,11 @@ static inline int page_mapcount(struct page *page)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int total_mapcount(struct page *page);
-int page_trans_huge_mapcount(struct page *page);
#else
static inline int total_mapcount(struct page *page)
{
return page_mapcount(page);
}
-static inline int page_trans_huge_mapcount(struct page *page)
-{
- return page_mapcount(page);
-}
#endif
static inline struct page *virt_to_head_page(const void *x)
@@ -1106,6 +1101,7 @@ static inline bool page_is_devmap_managed(struct page *page)
return false;
switch (page->pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
case MEMORY_DEVICE_FS_DAX:
return true;
default:
@@ -1135,6 +1131,21 @@ static inline bool is_device_private_page(const struct page *page)
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}
+static inline bool is_device_coherent_page(const struct page *page)
+{
+ return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+ is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_COHERENT;
+}
+
+static inline bool is_dev_private_or_coherent_page(const struct page *page)
+{
+ return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+ is_zone_device_page(page) &&
+ (page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ page->pgmap->type == MEMORY_DEVICE_COHERENT);
+}
+
static inline bool is_pci_p2pdma_page(const struct page *page)
{
return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
@@ -1916,10 +1927,6 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
long pin_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas);
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages, int *locked);
-long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages, int *locked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
@@ -2925,13 +2932,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
* and return without waiting upon it */
-#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */
#define FOLL_NOFAULT 0x80 /* do not fault in pages */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
-#define FOLL_MLOCK 0x1000 /* lock present pages */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
@@ -3151,10 +3156,12 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
}
#endif
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
int vmemmap_remap_free(unsigned long start, unsigned long end,
unsigned long reuse);
int vmemmap_remap_alloc(unsigned long start, unsigned long end,
unsigned long reuse, gfp_t gfp_mask);
+#endif
void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index b725839dfe715..884d6f6af05bb 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -99,7 +99,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
- list_add(&folio->lru, &lruvec->lists[lru]);
+ if (lru != LRU_UNEVICTABLE)
+ list_add(&folio->lru, &lruvec->lists[lru]);
}
static __always_inline void add_page_to_lru_list(struct page *page,
@@ -115,6 +116,7 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
+ /* This is not expected to be used on LRU_UNEVICTABLE */
list_add_tail(&folio->lru, &lruvec->lists[lru]);
}
@@ -127,8 +129,11 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
static __always_inline
void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
- list_del(&folio->lru);
- update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio),
+ enum lru_list lru = folio_lru_list(folio);
+
+ if (lru != LRU_UNEVICTABLE)
+ list_del(&folio->lru);
+ update_lru_size(lruvec, lru, folio_zonenum(folio),
-folio_nr_pages(folio));
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5140e5feb4866..475bdb2827697 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -85,7 +85,16 @@ struct page {
* lruvec->lru_lock. Sometimes used as a generic list
* by the page owner.
*/
- struct list_head lru;
+ union {
+ struct list_head lru;
+ /* Or, for the Unevictable "LRU list" slot */
+ struct {
+ /* Always even, to negate PageTail */
+ void *__filler;
+ /* Count page's or folio's mlocks */
+ unsigned int mlock_count;
+ };
+ };
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
pgoff_t index; /* Our offset within mapping. */
@@ -241,7 +250,13 @@ struct folio {
struct {
/* public: */
unsigned long flags;
- struct list_head lru;
+ union {
+ struct list_head lru;
+ struct {
+ void *__filler;
+ unsigned int mlock_count;
+ };
+ };
struct address_space *mapping;
pgoff_t index;
void *private;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aed44e9b5d899..3fff6deca2c08 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -83,6 +83,17 @@ static inline bool is_migrate_movable(int mt)
return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
}
+/*
+ * Check whether a migratetype can be merged with another migratetype.
+ *
+ * It is only mergeable when it can fall back to other migratetypes for
+ * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c.
+ */
+static inline bool migratetype_is_mergeable(int mt)
+{
+ return mt < MIGRATE_PCPTYPES;
+}
+
#define for_each_migratetype_order(order, type) \
for (order = 0; order < MAX_ORDER; order++) \
for (type = 0; type < MIGRATE_TYPES; type++)
@@ -211,6 +222,9 @@ enum node_stat_item {
#ifdef CONFIG_SWAP
NR_SWAPCACHE,
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ PGPROMOTE_SUCCESS, /* promote successfully */
+#endif
NR_VM_NODE_STAT_ITEMS
};
@@ -920,12 +934,6 @@ typedef struct pglist_data {
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
-#ifdef CONFIG_FLATMEM
-#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr))
-#else
-#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr))
-#endif
-#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr))
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
@@ -1101,7 +1109,6 @@ static inline struct pglist_data *NODE_DATA(int nid)
{
return &contig_page_data;
}
-#define NODE_MEM_MAP(nid) mem_map
#else /* CONFIG_NUMA */
@@ -1390,11 +1397,9 @@ static inline unsigned long *section_to_usemap(struct mem_section *ms)
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
#ifdef CONFIG_SPARSEMEM_EXTREME
- if (!mem_section)
+ if (!*mem_section || !mem_section[SECTION_NR_TO_ROOT(nr)])
return NULL;
#endif
- if (!mem_section[SECTION_NR_TO_ROOT(nr)])
- return NULL;
return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
}
extern size_t mem_section_usage_size(void);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 68f81d8d36def..4829e6869f2ad 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -570,7 +570,6 @@ extern void nfs_complete_unlink(struct dentry *dentry, struct inode *);
/*
* linux/fs/nfs/write.c
*/
-extern int nfs_congestion_kb;
extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
extern int nfs_writepages(struct address_space *, struct writeback_control *);
extern int nfs_flush_incompatible(struct file *file, struct page *page);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index ca0959e51e817..3444ebbc63b6c 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -137,7 +137,6 @@ struct nfs_server {
struct rpc_clnt * client_acl; /* ACL RPC client handle */
struct nlm_host *nlm_host; /* NLM client handle */
struct nfs_iostats __percpu *io_stats; /* I/O statistics */
- atomic_long_t writeback; /* number of writeback pages */
unsigned int flags; /* various flags */
/* The following are for internal use only. Also see uapi/linux/nfs_mount.h */
diff --git a/include/linux/node.h b/include/linux/node.h
index bb21fd631b162..7f876d48af11f 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -112,6 +112,7 @@ static inline void link_mem_sections(int nid, unsigned long start_pfn,
extern void unregister_node(struct node *node);
#ifdef CONFIG_NUMA
+extern void node_dev_init(void);
/* Core of the node registration - only memory hotplug should use this */
extern int __register_one_node(int nid);
@@ -149,6 +150,9 @@ extern void register_hugetlbfs_with_node(node_registration_func_t doregister,
node_registration_func_t unregister);
#endif
#else
+static inline void node_dev_init(void)
+{
+}
static inline int __register_one_node(int nid)
{
return 0;
@@ -181,4 +185,9 @@ static inline void register_hugetlbfs_with_node(node_registration_func_t reg,
#define to_node(device) container_of(device, struct node, dev)
+static inline bool node_is_toptier(int node)
+{
+ return node_state(node, N_CPU);
+}
+
#endif /* _LINUX_NODE_H_ */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1c3b6e5c8bfd3..340cb81565683 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -190,13 +190,81 @@ enum pageflags {
#ifndef __GENERATING_BOUNDS_H
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
+ hugetlb_free_vmemmap_enabled_key);
+
+static __always_inline bool hugetlb_free_vmemmap_enabled(void)
+{
+ return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
+ &hugetlb_free_vmemmap_enabled_key);
+}
+
+/*
+ * If the feature of freeing some vmemmap pages associated with each HugeTLB
+ * page is enabled, the head vmemmap page frame is reused and all of the tail
+ * vmemmap addresses map to the head vmemmap page frame (furture details can
+ * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other
+ * words, there are more than one page struct with PG_head associated with each
+ * HugeTLB page. We __know__ that there is only one head page struct, the tail
+ * page structs with PG_head are fake head page structs. We need an approach
+ * to distinguish between those two different types of page structs so that
+ * compound_head() can return the real head page struct when the parameter is
+ * the tail page struct but with PG_head.
+ *
+ * The page_fixed_fake_head() returns the real head page struct if the @page is
+ * fake page head, otherwise, returns @page which can either be a true page
+ * head or tail.
+ */
+static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
+{
+ if (!hugetlb_free_vmemmap_enabled())
+ return page;
+
+ /*
+ * Only addresses aligned with PAGE_SIZE of struct page may be fake head
+ * struct page. The alignment check aims to avoid access the fields (
+ * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly)
+ * cold cacheline in some cases.
+ */
+ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) &&
+ test_bit(PG_head, &page->flags)) {
+ /*
+ * We can safely access the field of the @page[1] with PG_head
+ * because the @page is a compound page composed with at least
+ * two contiguous pages.
+ */
+ unsigned long head = READ_ONCE(page[1].compound_head);
+
+ if (likely(head & 1))
+ return (const struct page *)(head - 1);
+ }
+ return page;
+}
+#else
+static inline const struct page *page_fixed_fake_head(const struct page *page)
+{
+ return page;
+}
+
+static inline bool hugetlb_free_vmemmap_enabled(void)
+{
+ return false;
+}
+#endif
+
+static __always_inline int page_is_fake_head(struct page *page)
+{
+ return page_fixed_fake_head(page) != page;
+}
+
static inline unsigned long _compound_head(const struct page *page)
{
unsigned long head = READ_ONCE(page->compound_head);
if (unlikely(head & 1))
return head - 1;
- return (unsigned long)page;
+ return (unsigned long)page_fixed_fake_head(page);
}
#define compound_head(page) ((typeof(page))_compound_head(page))
@@ -231,12 +299,13 @@ static inline unsigned long _compound_head(const struct page *page)
static __always_inline int PageTail(struct page *page)
{
- return READ_ONCE(page->compound_head) & 1;
+ return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page);
}
static __always_inline int PageCompound(struct page *page)
{
- return test_bit(PG_head, &page->flags) || PageTail(page);
+ return test_bit(PG_head, &page->flags) ||
+ READ_ONCE(page->compound_head) & 1;
}
#define PAGE_POISON_PATTERN -1l
@@ -695,7 +764,20 @@ static inline bool test_set_page_writeback(struct page *page)
return set_page_writeback(page);
}
-__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
+static __always_inline bool folio_test_head(struct folio *folio)
+{
+ return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY));
+}
+
+static __always_inline int PageHead(struct page *page)
+{
+ PF_POISONED_CHECK(page);
+ return test_bit(PG_head, &page->flags) && !page_is_fake_head(page);
+}
+
+__SETPAGEFLAG(Head, head, PF_ANY)
+__CLEARPAGEFLAG(Head, head, PF_ANY)
+CLEARPAGEFLAG(Head, head, PF_ANY)
/**
* folio_test_large() - Does this folio contain more than one page?
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 270bf5136c34e..dc31eb981ea2b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -594,13 +594,6 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
pgoff_t end, unsigned int nr_pages,
struct page **pages);
-static inline unsigned find_get_pages(struct address_space *mapping,
- pgoff_t *start, unsigned int nr_pages,
- struct page **pages)
-{
- return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages,
- pages);
-}
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e704b1a4c06c0..73cce292d32c0 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -12,6 +12,8 @@
#include <linux/memcontrol.h>
#include <linux/highmem.h>
+#include <linux/refcount.h>
+
/*
* The anon_vma heads a list of private "related" vmas, to scan if
* an anonymous page pointing to this anon_vma needs to be unmapped:
@@ -36,7 +38,7 @@ struct anon_vma {
* the reference is responsible for clearing up the
* anon_vma if they are the last user on release
*/
- atomic_t refcount;
+ refcount_t refcount;
/*
* Count of child anon_vmas and VMAs which points to this anon_vma.
@@ -100,14 +102,14 @@ enum ttu_flags {
#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
- atomic_inc(&anon_vma->refcount);
+ refcount_inc(&anon_vma->refcount);
}
void __put_anon_vma(struct anon_vma *anon_vma);
static inline void put_anon_vma(struct anon_vma *anon_vma)
{
- if (atomic_dec_and_test(&anon_vma->refcount))
+ if (refcount_dec_and_test(&anon_vma->refcount))
__put_anon_vma(anon_vma);
}
@@ -167,18 +169,19 @@ struct anon_vma *page_get_anon_vma(struct page *page);
*/
void page_move_anon_rmap(struct page *, struct vm_area_struct *);
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
- unsigned long, bool);
+ unsigned long address, bool compound);
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
- unsigned long, int);
+ unsigned long address, int flags);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
- unsigned long, bool);
-void page_add_file_rmap(struct page *, bool);
-void page_remove_rmap(struct page *, bool);
-
+ unsigned long address, bool compound);
+void page_add_file_rmap(struct page *, struct vm_area_struct *,
+ bool compound);
+void page_remove_rmap(struct page *, struct vm_area_struct *,
+ bool compound);
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
- unsigned long);
+ unsigned long address);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
- unsigned long);
+ unsigned long address);
static inline void page_dup_rmap(struct page *page, bool compound)
{
@@ -237,12 +240,6 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
*/
int folio_mkclean(struct folio *);
-/*
- * called in munlock()/munmap() path to check for other vmas holding
- * the page mlocked.
- */
-void page_mlock(struct page *page);
-
void remove_migration_ptes(struct page *old, struct page *new, bool locked);
/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75ba8aa60248b..d7d232e7e654d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1065,6 +1065,7 @@ struct task_struct {
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
unsigned long last_switch_time;
+ unsigned long killed_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
@@ -1492,6 +1493,13 @@ struct task_struct {
struct callback_head l1d_flush_kill;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3f2b70f8d32ce..c1076b5e17fb1 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -23,6 +23,16 @@ enum sched_tunable_scaling {
SCHED_TUNABLESCALING_END,
};
+#define NUMA_BALANCING_DISABLED 0x0
+#define NUMA_BALANCING_NORMAL 0x1
+#define NUMA_BALANCING_MEMORY_TIERING 0x2
+
+#ifdef CONFIG_NUMA_BALANCING
+extern int sysctl_numa_balancing_mode;
+#else
+#define sysctl_numa_balancing_mode 0
+#endif
+
/*
* control realtime throttling:
*
diff --git a/include/linux/stddef.h b/include/linux/stddef.h
index ca507bd5f8082..929d67710cc51 100644
--- a/include/linux/stddef.h
+++ b/include/linux/stddef.h
@@ -13,11 +13,7 @@ enum {
};
#undef offsetof
-#ifdef __compiler_offsetof
-#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
-#else
-#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
-#endif
+#define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER)
/**
* sizeof_field() - Report the size of a struct field in bytes
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1d38d9475c4d0..b546e4bd5c5a2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -514,7 +514,6 @@ extern int __swp_swapcount(swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
-extern bool reuse_swap_page(struct page *);
extern int try_to_free_swap(struct page *);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
@@ -680,9 +679,6 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-#define reuse_swap_page(page) \
- (page_trans_huge_mapcount(page) == 1)
-
static inline int try_to_free_swap(struct page *page)
{
return 0;
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 73a6f34b3847a..1087d1e2be5c4 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -209,10 +209,7 @@ __bad_copy_from(void);
extern void __compiletime_error("copy destination size is too small")
__bad_copy_to(void);
-static inline void copy_overflow(int size, unsigned long count)
-{
- WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
-}
+void copy_overflow(int size, unsigned long count);
static __always_inline __must_check bool
check_copy_size(const void *addr, size_t bytes, bool is_source)
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index ac0394087f7d4..bca27b4e5eb2d 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -401,8 +401,6 @@ static inline void user_access_restore(unsigned long flags) { }
#endif
#ifdef CONFIG_HARDENED_USERCOPY
-void usercopy_warn(const char *name, const char *detail, bool to_user,
- unsigned long offset, unsigned long len);
void __noreturn usercopy_abort(const char *name, const char *detail,
bool to_user, unsigned long offset,
unsigned long len);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 7b2363388bfa2..16a0a4fd000be 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -129,6 +129,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_SWAP
SWAP_RA,
SWAP_RA_HIT,
+#ifdef CONFIG_KSM
+ KSM_SWPIN_COPY,
+#endif
#endif
#ifdef CONFIG_X86
DIRECT_MAP_LEVEL2_SPLIT,
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 880227b9f0440..7b879c77bec5f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -35,17 +35,6 @@ struct notifier_block; /* in notifier.h */
#define VM_DEFER_KMEMLEAK 0
#endif
-/*
- * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC.
- *
- * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after
- * shadow memory has been mapped. It's used to handle allocation errors so that
- * we don't try to poison shadow on free if it was never allocated.
- *
- * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to
- * determine which allocations need the module shadow freed.
- */
-
/* bits [20..32] reserved for arch specific ioremap internals */
/*
@@ -126,6 +115,13 @@ static inline int arch_vmap_pte_supported_shift(unsigned long size)
}
#endif
+#ifndef arch_vmap_pgprot_tagged
+static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
+{
+ return prot;
+}
+#endif
+
/*
* Highlevel APIs for driver use
*/
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index fec248ab1fec5..dc2b94e6a94f0 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -345,28 +345,13 @@ void wb_domain_exit(struct wb_domain *dom);
extern struct wb_domain global_wb_domain;
/* These are exported to sysctl. */
-extern int dirty_background_ratio;
-extern unsigned long dirty_background_bytes;
-extern int vm_dirty_ratio;
-extern unsigned long vm_dirty_bytes;
extern unsigned int dirty_writeback_interval;
extern unsigned int dirty_expire_interval;
extern unsigned int dirtytime_expire_interval;
-extern int vm_highmem_is_dirtyable;
extern int laptop_mode;
-int dirty_background_ratio_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
-int dirty_background_bytes_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
-int dirty_ratio_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
-int dirty_bytes_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
int dirtytime_interval_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
-int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 4fdb14a81108b..d651f3437367d 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -29,7 +29,6 @@
EM( SCAN_VMA_NULL, "vma_null") \
EM( SCAN_VMA_CHECK, "vma_check_failed") \
EM( SCAN_ADDRESS_RANGE, "not_suitable_address_range") \
- EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \
EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 779f3fad9ecd5..061b5128f335a 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -105,6 +105,37 @@ TRACE_EVENT(mm_migrate_pages_start,
__print_symbolic(__entry->reason, MIGRATE_REASON))
);
+DECLARE_EVENT_CLASS(migration_pte,
+
+ TP_PROTO(unsigned long addr, unsigned long pte, int order),
+
+ TP_ARGS(addr, pte, order),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, addr)
+ __field(unsigned long, pte)
+ __field(int, order)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->pte = pte;
+ __entry->order = order;
+ ),
+
+ TP_printk("addr=%lx, pte=%lx order=%d", __entry->addr, __entry->pte, __entry->order)
+);
+
+DEFINE_EVENT(migration_pte, set_migration_pte,
+ TP_PROTO(unsigned long addr, unsigned long pte, int order),
+ TP_ARGS(addr, pte, order)
+);
+
+DEFINE_EVENT(migration_pte, remove_migration_pte,
+ TP_PROTO(unsigned long addr, unsigned long pte, int order),
+ TP_ARGS(addr, pte, order)
+);
+
#endif /* _TRACE_MIGRATE_H */
/* This part must be outside protection */
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 116ed4d5d0f88..0698c5d0f1947 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -29,7 +29,6 @@
{(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \
{(unsigned long)GFP_DMA32, "GFP_DMA32"}, \
{(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \
- {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \
{(unsigned long)__GFP_IO, "__GFP_IO"}, \
{(unsigned long)__GFP_FS, "__GFP_FS"}, \
{(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \
@@ -49,12 +48,20 @@
{(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \
{(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\
{(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\
- {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"}, \
- {(unsigned long)__GFP_SKIP_KASAN_POISON,"__GFP_SKIP_KASAN_POISON"}\
+ {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \
+
+#ifdef CONFIG_KASAN_HW_TAGS
+#define __def_gfpflag_names_kasan , \
+ {(unsigned long)__GFP_SKIP_ZERO, "__GFP_SKIP_ZERO"}, \
+ {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \
+ {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"}
+#else
+#define __def_gfpflag_names_kasan
+#endif
#define show_gfp_flags(flags) \
(flags) ? __print_flags(flags, "|", \
- __def_gfpflag_names \
+ __def_gfpflag_names __def_gfpflag_names_kasan \
) : "none"
#ifdef CONFIG_MMU
diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
index ca3f2767828a6..202b3e3e67ff2 100644
--- a/include/trace/events/thp.h
+++ b/include/trace/events/thp.h
@@ -48,6 +48,33 @@ TRACE_EVENT(hugepage_update,
TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set)
);
+DECLARE_EVENT_CLASS(migration_pmd,
+
+ TP_PROTO(unsigned long addr, unsigned long pmd),
+
+ TP_ARGS(addr, pmd),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, addr)
+ __field(unsigned long, pmd)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->pmd = pmd;
+ ),
+ TP_printk("addr=%lx, pmd=%lx", __entry->addr, __entry->pmd)
+);
+
+DEFINE_EVENT(migration_pmd, set_migration_pmd,
+ TP_PROTO(unsigned long addr, unsigned long pmd),
+ TP_ARGS(addr, pmd)
+);
+
+DEFINE_EVENT(migration_pmd, remove_migration_pmd,
+ TP_PROTO(unsigned long addr, unsigned long pmd),
+ TP_ARGS(addr, pmd)
+);
#endif /* _TRACE_THP_H */
/* This part must be outside protection */
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index a345b1e12daf3..86b2a82da546a 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -735,34 +735,6 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
)
);
-DECLARE_EVENT_CLASS(writeback_congest_waited_template,
-
- TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
-
- TP_ARGS(usec_timeout, usec_delayed),
-
- TP_STRUCT__entry(
- __field( unsigned int, usec_timeout )
- __field( unsigned int, usec_delayed )
- ),
-
- TP_fast_assign(
- __entry->usec_timeout = usec_timeout;
- __entry->usec_delayed = usec_delayed;
- ),
-
- TP_printk("usec_timeout=%u usec_delayed=%u",
- __entry->usec_timeout,
- __entry->usec_delayed)
-);
-
-DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
-
- TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
-
- TP_ARGS(usec_timeout, usec_delayed)
-);
-
DECLARE_EVENT_CLASS(writeback_single_inode_template,
TP_PROTO(struct inode *inode,
diff --git a/init/main.c b/init/main.c
index ada50f5a15e43..e2a046a199127 100644
--- a/init/main.c
+++ b/init/main.c
@@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized);
unsigned int reset_devices;
EXPORT_SYMBOL(reset_devices);
-static int __init set_reset_devices(char *str)
+static int __init set_reset_devices(char *str __always_unused)
{
reset_devices = 1;
return 1;
@@ -231,13 +231,13 @@ static bool __init obsolete_checksetup(char *line)
unsigned long loops_per_jiffy = (1<<12);
EXPORT_SYMBOL(loops_per_jiffy);
-static int __init debug_kernel(char *str)
+static int __init debug_kernel(char *str __always_unused)
{
console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
return 0;
}
-static int __init quiet_kernel(char *str)
+static int __init quiet_kernel(char *str __always_unused)
{
console_loglevel = CONSOLE_LOGLEVEL_QUIET;
return 0;
@@ -474,7 +474,7 @@ static void __init setup_boot_config(void)
get_boot_config_from_initrd(NULL, NULL);
}
-static int __init warn_bootconfig(char *str)
+static int __init warn_bootconfig(char *str __always_unused)
{
pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n");
return 0;
@@ -503,7 +503,8 @@ static void __init repair_env_string(char *param, char *val)
/* Anything after -- gets handed straight to init. */
static int __init set_init_arg(char *param, char *val,
- const char *unused, void *arg)
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
unsigned int i;
@@ -528,7 +529,8 @@ static int __init set_init_arg(char *param, char *val,
* unused parameters (modprobe will find them in /proc/cmdline).
*/
static int __init unknown_bootoption(char *param, char *val,
- const char *unused, void *arg)
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
size_t len = strlen(param);
@@ -728,7 +730,8 @@ noinline void __ref rest_init(void)
/* Check for early params. */
static int __init do_early_param(char *param, char *val,
- const char *unused, void *arg)
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
const struct obs_kernel_param *p;
@@ -1354,8 +1357,10 @@ static const char *initcall_level_names[] __initdata = {
"late",
};
-static int __init ignore_unknown_bootoption(char *param, char *val,
- const char *unused, void *arg)
+static int __init ignore_unknown_bootoption(char *param __always_unused,
+ char *val __always_unused,
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
return 0;
}
@@ -1492,7 +1497,7 @@ void __weak free_initmem(void)
free_initmem_default(POISON_FREE_INITMEM);
}
-static int __ref kernel_init(void *unused)
+static int __ref kernel_init(void *unused __always_unused)
{
int ret;
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5becca9be867c..089c34d0732cf 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -45,6 +45,7 @@
struct mqueue_fs_context {
struct ipc_namespace *ipc_ns;
+ bool newns; /* Set if newly created ipc namespace */
};
#define MQUEUE_MAGIC 0x19800202
@@ -427,6 +428,14 @@ static int mqueue_get_tree(struct fs_context *fc)
{
struct mqueue_fs_context *ctx = fc->fs_private;
+ /*
+ * With a newly created ipc namespace, we don't need to do a search
+ * for an ipc namespace match, but we still need to set s_fs_info.
+ */
+ if (ctx->newns) {
+ fc->s_fs_info = ctx->ipc_ns;
+ return get_tree_nodev(fc, mqueue_fill_super);
+ }
return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns);
}
@@ -454,6 +463,10 @@ static int mqueue_init_fs_context(struct fs_context *fc)
return 0;
}
+/*
+ * mq_init_ns() is currently the only caller of mq_create_mount().
+ * So the ns parameter is always a newly created ipc namespace.
+ */
static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
{
struct mqueue_fs_context *ctx;
@@ -465,6 +478,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
return ERR_CAST(fc);
ctx = fc->fs_private;
+ ctx->newns = true;
put_ipc_ns(ctx->ipc_ns);
ctx->ipc_ns = get_ipc_ns(ns);
put_user_ns(fc->user_ns);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6357c3580d07b..eed2f7437d963 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -173,7 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
return err;
}
- /* For try_to_free_swap() and munlock_vma_page() below */
+ /* For try_to_free_swap() below */
lock_page(old_page);
mmu_notifier_invalidate_range_start(&range);
@@ -201,13 +201,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(old_page, false);
+ page_remove_rmap(old_page, vma, false);
if (!page_mapped(old_page))
try_to_free_swap(old_page);
page_vma_mapped_walk_done(&pvmw);
-
- if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
- munlock_vma_page(old_page);
put_page(old_page);
err = 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index d75a528f7b219..57d624f05182e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -254,6 +254,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
* so cache the vm_struct.
*/
if (stack) {
+ stack = kasan_reset_tag(stack);
tsk->stack_vm_area = find_vm_area(stack);
tsk->stack = stack;
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 52501e5f76554..40220dfd6fa93 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -147,6 +147,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
touch_nmi_watchdog();
}
+static void check_killed_task(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long stamp = t->killed_time;
+
+ /*
+ * Ensure the task is not frozen.
+ * Also, skip vfork and any other user process that freezer should skip.
+ */
+ if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+ return;
+ /*
+ * Skip threads which are already inside do_exit(), for exit_mm() etc.
+ * might take many seconds.
+ */
+ if (t->flags & PF_EXITING)
+ return;
+ if (!stamp) {
+ stamp = jiffies;
+ if (!stamp)
+ stamp++;
+ t->killed_time = stamp;
+ return;
+ }
+ if (time_is_after_jiffies(stamp + timeout * HZ))
+ return;
+ trace_sched_process_hang(t);
+ if (sysctl_hung_task_panic) {
+ console_verbose();
+ hung_task_call_panic = true;
+ }
+ /*
+ * This thread failed to terminate for more than
+ * sysctl_hung_task_timeout_secs seconds, complain:
+ */
+ pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
+ t->comm, t->pid, (jiffies - stamp) / HZ);
+ sched_show_task(t);
+ hung_task_show_lock = true;
+ touch_nmi_watchdog();
+}
+
/*
* To avoid extending the RCU grace period for an unbounded amount of time,
* periodically exit the critical section and enter a new one.
@@ -198,6 +239,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
+ /* Check threads which are about to terminate. */
+ if (unlikely(fatal_signal_pending(t)))
+ check_killed_task(t, timeout);
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 36ca640c4f8e7..475524bd900ab 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -459,37 +459,28 @@ void kcov_task_exit(struct task_struct *t)
static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
{
int res = 0;
- void *area;
struct kcov *kcov = vma->vm_file->private_data;
unsigned long size, off;
struct page *page;
unsigned long flags;
- area = vmalloc_user(vma->vm_end - vma->vm_start);
- if (!area)
- return -ENOMEM;
-
spin_lock_irqsave(&kcov->lock, flags);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+ if (kcov->area == NULL || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
}
- if (!kcov->area) {
- kcov->area = area;
- vma->vm_flags |= VM_DONTEXPAND;
- spin_unlock_irqrestore(&kcov->lock, flags);
- for (off = 0; off < size; off += PAGE_SIZE) {
- page = vmalloc_to_page(kcov->area + off);
- if (vm_insert_page(vma, vma->vm_start + off, page))
- WARN_ONCE(1, "vm_insert_page() failed");
- }
- return 0;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vma->vm_flags |= VM_DONTEXPAND;
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ if (vm_insert_page(vma, vma->vm_start + off, page))
+ WARN_ONCE(1, "vm_insert_page() failed");
}
+ return 0;
exit:
spin_unlock_irqrestore(&kcov->lock, flags);
- vfree(area);
return res;
}
@@ -564,31 +555,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
unsigned long arg)
{
struct task_struct *t;
- unsigned long size, unused;
+ unsigned long flags, unused;
int mode, i;
struct kcov_remote_arg *remote_arg;
struct kcov_remote *remote;
- unsigned long flags;
switch (cmd) {
- case KCOV_INIT_TRACE:
- /*
- * Enable kcov in trace mode and setup buffer size.
- * Must happen before anything else.
- */
- if (kcov->mode != KCOV_MODE_DISABLED)
- return -EBUSY;
- /*
- * Size must be at least 2 to hold current position and one PC.
- * Later we allocate size * sizeof(unsigned long) memory,
- * that must not overflow.
- */
- size = arg;
- if (size < 2 || size > INT_MAX / sizeof(unsigned long))
- return -EINVAL;
- kcov->size = size;
- kcov->mode = KCOV_MODE_INIT;
- return 0;
case KCOV_ENABLE:
/*
* Enable coverage for the current task.
@@ -692,9 +664,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
struct kcov_remote_arg *remote_arg = NULL;
unsigned int remote_num_handles;
unsigned long remote_arg_size;
- unsigned long flags;
+ unsigned long size, flags;
+ void *area;
- if (cmd == KCOV_REMOTE_ENABLE) {
+ kcov = filep->private_data;
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ *
+ * First check the size argument - it must be at least 2
+ * to hold the current position and one PC.
+ */
+ size = arg;
+ if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ area = vmalloc_user(size * sizeof(unsigned long));
+ if (area == NULL)
+ return -ENOMEM;
+ spin_lock_irqsave(&kcov->lock, flags);
+ if (kcov->mode != KCOV_MODE_DISABLED) {
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vfree(area);
+ return -EBUSY;
+ }
+ kcov->area = area;
+ kcov->size = size;
+ kcov->mode = KCOV_MODE_INIT;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ return 0;
+ case KCOV_REMOTE_ENABLE:
if (get_user(remote_num_handles, (unsigned __user *)(arg +
offsetof(struct kcov_remote_arg, num_handles))))
return -EFAULT;
@@ -710,16 +710,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
return -EINVAL;
}
arg = (unsigned long)remote_arg;
+ fallthrough;
+ default:
+ /*
+ * All other commands can be normally executed under a spin lock, so we
+ * obtain and release it here in order to simplify kcov_ioctl_locked().
+ */
+ spin_lock_irqsave(&kcov->lock, flags);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ kfree(remote_arg);
+ return res;
}
-
- kcov = filep->private_data;
- spin_lock_irqsave(&kcov->lock, flags);
- res = kcov_ioctl_locked(kcov, cmd, arg);
- spin_unlock_irqrestore(&kcov->lock, flags);
-
- kfree(remote_arg);
-
- return res;
}
static const struct file_operations kcov_fops = {
diff --git a/kernel/panic.c b/kernel/panic.c
index 55b50e052ec3a..3c3fb36d8d414 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
#define PANIC_PRINT_LOCK_INFO 0x00000008
#define PANIC_PRINT_FTRACE_INFO 0x00000010
#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
+#define PANIC_PRINT_ALL_CPU_BT 0x00000040
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -152,6 +153,9 @@ static void panic_print_sys_info(void)
if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
console_flush_on_panic(CONSOLE_REPLAY_ALL);
+ if (panic_print & PANIC_PRINT_ALL_CPU_BT)
+ trigger_all_cpu_backtrace();
+
if (panic_print & PANIC_PRINT_TASK_INFO)
show_state();
@@ -185,6 +189,16 @@ void panic(const char *fmt, ...)
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ }
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -576,16 +590,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (regs)
show_regs(regs);
- if (panic_on_warn) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
+ if (panic_on_warn)
panic("panic_on_warn set ...\n");
- }
if (!regs)
dump_stack();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a506685e878aa..28d1b7af03dcb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4280,7 +4280,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
#ifdef CONFIG_NUMA_BALANCING
-void set_numabalancing_state(bool enabled)
+int sysctl_numa_balancing_mode;
+
+static void __set_numabalancing_state(bool enabled)
{
if (enabled)
static_branch_enable(&sched_numa_balancing);
@@ -4288,13 +4290,22 @@ void set_numabalancing_state(bool enabled)
static_branch_disable(&sched_numa_balancing);
}
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
+ else
+ sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
+ __set_numabalancing_state(enabled);
+}
+
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
- int state = static_branch_likely(&sched_numa_balancing);
+ int state = sysctl_numa_balancing_mode;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4304,8 +4315,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
- if (write)
- set_numabalancing_state(state);
+ if (write) {
+ sysctl_numa_balancing_mode = state;
+ __set_numabalancing_state(state);
+ }
return err;
}
#endif
diff --git a/kernel/scs.c b/kernel/scs.c
index 579841be88646..b83bc9251f996 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -32,7 +32,7 @@ static void *__scs_alloc(int node)
for (i = 0; i < NR_CACHED_SCS; i++) {
s = this_cpu_xchg(scs_cache[i], NULL);
if (s) {
- kasan_unpoison_vmalloc(s, SCS_SIZE);
+ kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE);
memset(s, 0, SCS_SIZE);
return s;
}
@@ -78,7 +78,7 @@ void scs_free(void *s)
if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
return;
- kasan_unpoison_vmalloc(s, SCS_SIZE);
+ kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE);
vfree_atomic(s);
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6a494ab55beaf..788b9a34d5ab4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -100,8 +100,6 @@
static const int six_hundred_forty_kb = 640 * 1024;
#endif
-/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
-static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
static const int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -1698,7 +1696,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sysctl_numa_balancing,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_FOUR,
},
#endif /* CONFIG_NUMA_BALANCING */
{
@@ -2400,55 +2398,6 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ZERO,
},
{
- .procname = "dirty_background_ratio",
- .data = &dirty_background_ratio,
- .maxlen = sizeof(dirty_background_ratio),
- .mode = 0644,
- .proc_handler = dirty_background_ratio_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- {
- .procname = "dirty_background_bytes",
- .data = &dirty_background_bytes,
- .maxlen = sizeof(dirty_background_bytes),
- .mode = 0644,
- .proc_handler = dirty_background_bytes_handler,
- .extra1 = SYSCTL_LONG_ONE,
- },
- {
- .procname = "dirty_ratio",
- .data = &vm_dirty_ratio,
- .maxlen = sizeof(vm_dirty_ratio),
- .mode = 0644,
- .proc_handler = dirty_ratio_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- {
- .procname = "dirty_bytes",
- .data = &vm_dirty_bytes,
- .maxlen = sizeof(vm_dirty_bytes),
- .mode = 0644,
- .proc_handler = dirty_bytes_handler,
- .extra1 = (void *)&dirty_bytes_min,
- },
- {
- .procname = "dirty_writeback_centisecs",
- .data = &dirty_writeback_interval,
- .maxlen = sizeof(dirty_writeback_interval),
- .mode = 0644,
- .proc_handler = dirty_writeback_centisecs_handler,
- },
- {
- .procname = "dirty_expire_centisecs",
- .data = &dirty_expire_interval,
- .maxlen = sizeof(dirty_expire_interval),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
.procname = "dirtytime_expire_seconds",
.data = &dirtytime_expire_interval,
.maxlen = sizeof(dirtytime_expire_interval),
@@ -2620,13 +2569,6 @@ static struct ctl_table vm_table[] = {
},
#endif
{
- .procname = "laptop_mode",
- .data = &laptop_mode,
- .maxlen = sizeof(laptop_mode),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
.procname = "vfs_cache_pressure",
.data = &sysctl_vfs_cache_pressure,
.maxlen = sizeof(sysctl_vfs_cache_pressure),
@@ -2723,17 +2665,6 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ZERO,
},
#endif
-#ifdef CONFIG_HIGHMEM
- {
- .procname = "highmem_is_dirtyable",
- .data = &vm_highmem_is_dirtyable,
- .maxlen = sizeof(vm_highmem_is_dirtyable),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
#ifdef CONFIG_MEMORY_FAILURE
{
.procname = "memory_failure_early_kill",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 6613bcf460c69..fa933475826f5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -208,20 +208,87 @@ config DEBUG_BUGVERBOSE
endmenu # "printk and dmesg options"
+config DEBUG_KERNEL
+ bool "Kernel debugging"
+ help
+ Say Y here if you are developing drivers or trying to debug and
+ identify kernel problems.
+
+config DEBUG_MISC
+ bool "Miscellaneous debug code"
+ default DEBUG_KERNEL
+ depends on DEBUG_KERNEL
+ help
+ Say Y here if you need to enable miscellaneous debug code that should
+ be under a more specific debug option but isn't.
+
menu "Compile-time checks and compiler options"
config DEBUG_INFO
- bool "Compile the kernel with debug info"
- depends on DEBUG_KERNEL && !COMPILE_TEST
+ bool
help
- If you say Y here the resulting kernel image will include
- debugging info resulting in a larger kernel image.
+ A kernel debug info option other than "None" has been selected
+ in the "Debug information" choice below, indicating that debug
+ information will be generated for build targets.
+
+choice
+ prompt "Debug information"
+ depends on DEBUG_KERNEL
+ help
+ Selecting something other than "None" results in a kernel image
+ that will include debugging info resulting in a larger kernel image.
This adds debug symbols to the kernel and modules (gcc -g), and
is needed if you intend to use kernel crashdump or binary object
tools like crash, kgdb, LKCD, gdb, etc on the kernel.
- Say Y here only if you plan to debug the kernel.
- If unsure, say N.
+ Choose which version of DWARF debug info to emit. If unsure,
+ select "Toolchain default".
+
+config DEBUG_INFO_NONE
+ bool "Disable debug information"
+ help
+ Do not build the kernel with debugging information, which will
+ result in a faster and smaller build.
+
+config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT
+ bool "Rely on the toolchain's implicit default DWARF version"
+ select DEBUG_INFO
+ help
+ The implicit default version of DWARF debug info produced by a
+ toolchain changes over time.
+
+ This can break consumers of the debug info that haven't upgraded to
+ support newer revisions, and prevent testing newer versions, but
+ those should be less common scenarios.
+
+config DEBUG_INFO_DWARF4
+ bool "Generate DWARF Version 4 debuginfo"
+ select DEBUG_INFO
+ help
+ Generate DWARF v4 debug info. This requires gcc 4.5+ and gdb 7.0+.
+
+ If you have consumers of DWARF debug info that are not ready for
+ newer revisions of DWARF, you may wish to choose this or have your
+ config select this.
+
+config DEBUG_INFO_DWARF5
+ bool "Generate DWARF Version 5 debuginfo"
+ select DEBUG_INFO
+ depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502)))
+ help
+ Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc
+ 5.0+ accepts the -gdwarf-5 flag but only had partial support for some
+ draft features until 7.0), and gdb 8.0+.
+
+ Changes to the structure of debug info in Version 5 allow for around
+ 15-18% savings in resulting image and debug info section sizes as
+ compared to DWARF Version 4. DWARF Version 5 standardizes previous
+ extensions such as accelerators for symbol indexing and the format
+ for fission (.dwo/.dwp) files. Users may not want to select this
+ config if they rely on tooling that has not yet been updated to
+ support DWARF Version 5.
+
+endchoice # "Debug information"
if DEBUG_INFO
@@ -267,56 +334,12 @@ config DEBUG_INFO_SPLIT
to know about the .dwo files and include them.
Incompatible with older versions of ccache.
-choice
- prompt "DWARF version"
- help
- Which version of DWARF debug info to emit.
-
-config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT
- bool "Rely on the toolchain's implicit default DWARF version"
- help
- The implicit default version of DWARF debug info produced by a
- toolchain changes over time.
-
- This can break consumers of the debug info that haven't upgraded to
- support newer revisions, and prevent testing newer versions, but
- those should be less common scenarios.
-
- If unsure, say Y.
-
-config DEBUG_INFO_DWARF4
- bool "Generate DWARF Version 4 debuginfo"
- help
- Generate DWARF v4 debug info. This requires gcc 4.5+ and gdb 7.0+.
-
- If you have consumers of DWARF debug info that are not ready for
- newer revisions of DWARF, you may wish to choose this or have your
- config select this.
-
-config DEBUG_INFO_DWARF5
- bool "Generate DWARF Version 5 debuginfo"
- depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502)))
- depends on !DEBUG_INFO_BTF || PAHOLE_VERSION >= 121
- help
- Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc
- 5.0+ accepts the -gdwarf-5 flag but only had partial support for some
- draft features until 7.0), and gdb 8.0+.
-
- Changes to the structure of debug info in Version 5 allow for around
- 15-18% savings in resulting image and debug info section sizes as
- compared to DWARF Version 4. DWARF Version 5 standardizes previous
- extensions such as accelerators for symbol indexing and the format
- for fission (.dwo/.dwp) files. Users may not want to select this
- config if they rely on tooling that has not yet been updated to
- support DWARF Version 5.
-
-endchoice # "DWARF version"
-
config DEBUG_INFO_BTF
bool "Generate BTF typeinfo"
depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED
depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST
depends on BPF_SYSCALL
+ depends on !DEBUG_INFO_DWARF5 || PAHOLE_VERSION >= 121
help
Generate deduplicated BTF type information from DWARF debug info.
Turning this on expects presence of pahole tool, which will convert
@@ -593,20 +616,6 @@ source "lib/Kconfig.kcsan"
endmenu
-config DEBUG_KERNEL
- bool "Kernel debugging"
- help
- Say Y here if you are developing drivers or trying to debug and
- identify kernel problems.
-
-config DEBUG_MISC
- bool "Miscellaneous debug code"
- default DEBUG_KERNEL
- depends on DEBUG_KERNEL
- help
- Say Y here if you need to enable miscellaneous debug code that should
- be under a more specific debug option but isn't.
-
menu "Networking Debugging"
source "net/Kconfig.debug"
@@ -1797,6 +1806,12 @@ config IO_STRICT_DEVMEM
menu "$(SRCARCH) Debugging"
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 879757b6dd149..1f3e620188a24 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -178,17 +178,17 @@ config KASAN_TAGS_IDENTIFY
memory consumption.
config KASAN_VMALLOC
- bool "Back mappings in vmalloc space with real shadow memory"
- depends on KASAN_GENERIC && HAVE_ARCH_KASAN_VMALLOC
+ bool "Check accesses to vmalloc allocations"
+ depends on HAVE_ARCH_KASAN_VMALLOC
help
- By default, the shadow region for vmalloc space is the read-only
- zero page. This means that KASAN cannot detect errors involving
- vmalloc space.
-
- Enabling this option will hook in to vmap/vmalloc and back those
- mappings with real shadow memory allocated on demand. This allows
- for KASAN to detect more sorts of errors (and to support vmapped
- stacks), but at the cost of higher memory usage.
+ This mode makes KASAN check accesses to vmalloc allocations for
+ validity.
+
+ With software KASAN modes, checking is done for all types of vmalloc
+ allocations. Enabling this option leads to higher memory usage.
+
+ With hardware tag-based KASAN, only VM_ALLOC mappings are checked.
+ There is no additional memory usage.
config KASAN_KUNIT_TEST
tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS
diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan
index 63b70b8c55519..de022445fbba5 100644
--- a/lib/Kconfig.kcsan
+++ b/lib/Kconfig.kcsan
@@ -10,21 +10,10 @@ config HAVE_KCSAN_COMPILER
For the list of compilers that support KCSAN, please see
<file:Documentation/dev-tools/kcsan.rst>.
-config KCSAN_KCOV_BROKEN
- def_bool KCOV && CC_HAS_SANCOV_TRACE_PC
- depends on CC_IS_CLANG
- depends on !$(cc-option,-Werror=unused-command-line-argument -fsanitize=thread -fsanitize-coverage=trace-pc)
- help
- Some versions of clang support either KCSAN and KCOV but not the
- combination of the two.
- See https://bugs.llvm.org/show_bug.cgi?id=45831 for the status
- in newer releases.
-
menuconfig KCSAN
bool "KCSAN: dynamic data race detector"
depends on HAVE_ARCH_KCSAN && HAVE_KCSAN_COMPILER
depends on DEBUG_KERNEL && !KASAN
- depends on !KCSAN_KCOV_BROKEN
select STACKTRACE
help
The Kernel Concurrency Sanitizer (KCSAN) is a dynamic
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index 236c5cefc4cc5..f3c57ed518381 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -27,16 +27,6 @@ config UBSAN_TRAP
the system. For some system builders this is an acceptable
trade-off.
-config UBSAN_KCOV_BROKEN
- def_bool KCOV && CC_HAS_SANCOV_TRACE_PC
- depends on CC_IS_CLANG
- depends on !$(cc-option,-Werror=unused-command-line-argument -fsanitize=bounds -fsanitize-coverage=trace-pc)
- help
- Some versions of clang support either UBSAN or KCOV but not the
- combination of the two.
- See https://bugs.llvm.org/show_bug.cgi?id=45831 for the status
- in newer releases.
-
config CC_HAS_UBSAN_BOUNDS
def_bool $(cc-option,-fsanitize=bounds)
@@ -46,7 +36,6 @@ config CC_HAS_UBSAN_ARRAY_BOUNDS
config UBSAN_BOUNDS
bool "Perform array index bounds checking"
default UBSAN
- depends on !UBSAN_KCOV_BROKEN
depends on CC_HAS_UBSAN_ARRAY_BOUNDS || CC_HAS_UBSAN_BOUNDS
help
This option enables detection of directly indexed out of bounds
@@ -72,7 +61,6 @@ config UBSAN_ARRAY_BOUNDS
config UBSAN_LOCAL_BOUNDS
bool "Perform array local bounds checking"
depends on UBSAN_TRAP
- depends on !UBSAN_KCOV_BROKEN
depends on $(cc-option,-fsanitize=local-bounds)
help
This option enables -fsanitize=local-bounds which traps when an
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 926f4823d5eac..fd1728d94babb 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -271,8 +271,12 @@ static FORCE_INLINE int LZ4_decompress_generic(
ip += length;
op += length;
- /* Necessarily EOF, due to parsing restrictions */
- if (!partialDecoding || (cpy == oend))
+ /* Necessarily EOF when !partialDecoding.
+ * When partialDecoding, it is EOF if we've either
+ * filled the output buffer or
+ * can't proceed with reading an offset for following match.
+ */
+ if (!partialDecoding || (cpy == oend) || (ip >= (iend - 2)))
break;
} else {
/* may overwrite up to WILDCOPYLENGTH beyond cpy */
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 767538089a62e..dedce7908ac63 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -29,11 +29,32 @@
#include "test_hmm_uapi.h"
-#define DMIRROR_NDEVICES 2
+#define DMIRROR_NDEVICES 4
#define DMIRROR_RANGE_FAULT_TIMEOUT 1000
#define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE 16
+/*
+ * For device_private pages, dpage is just a dummy struct page
+ * representing a piece of device memory. dmirror_devmem_alloc_page
+ * allocates a real system memory page as backing storage to fake a
+ * real device. zone_device_data points to that backing page. But
+ * for device_coherent memory, the struct page represents real
+ * physical CPU-accessible memory that we can use directly.
+ */
+#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
+ (page)->zone_device_data : (page))
+
+static unsigned long spm_addr_dev0;
+module_param(spm_addr_dev0, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev0,
+ "Specify start address for SPM (special purpose memory) used for device 0. By setting this Coherent device type will be used. Make sure spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
+static unsigned long spm_addr_dev1;
+module_param(spm_addr_dev1, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev1,
+ "Specify start address for SPM (special purpose memory) used for device 1. By setting this Coherent device type will be used. Make sure spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;
@@ -84,6 +105,7 @@ struct dmirror_chunk {
struct dmirror_device {
struct cdev cdevice;
struct hmm_devmem *devmem;
+ unsigned int zone_device_type;
unsigned int devmem_capacity;
unsigned int devmem_count;
@@ -111,6 +133,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce,
return 0;
}
+static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
+{
+ return (mdevice->zone_device_type ==
+ HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
+}
+
+static enum migrate_vma_direction
+ dmirror_select_device(struct dmirror *dmirror)
+{
+ return (dmirror->mdevice->zone_device_type ==
+ HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
+ MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+}
+
static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
vfree(bounce->ptr);
@@ -451,28 +488,44 @@ fini:
return ret;
}
-static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
+static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
struct page **ppage)
{
struct dmirror_chunk *devmem;
- struct resource *res;
+ struct resource *res = NULL;
unsigned long pfn;
unsigned long pfn_first;
unsigned long pfn_last;
void *ptr;
+ int ret = -ENOMEM;
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
- return false;
+ return ret;
- res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
- "hmm_dmirror");
- if (IS_ERR(res))
+ switch (mdevice->zone_device_type) {
+ case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
+ res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
+ "hmm_dmirror");
+ if (IS_ERR_OR_NULL(res))
+ goto err_devmem;
+ devmem->pagemap.range.start = res->start;
+ devmem->pagemap.range.end = res->end;
+ devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+ break;
+ case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
+ devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ?
+ spm_addr_dev0 :
+ spm_addr_dev1;
+ devmem->pagemap.range.end = devmem->pagemap.range.start +
+ DEVMEM_CHUNK_SIZE - 1;
+ devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
+ break;
+ default:
+ ret = -EINVAL;
goto err_devmem;
+ }
- devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
- devmem->pagemap.range.start = res->start;
- devmem->pagemap.range.end = res->end;
devmem->pagemap.nr_range = 1;
devmem->pagemap.ops = &dmirror_devmem_ops;
devmem->pagemap.owner = mdevice;
@@ -493,10 +546,14 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
mdevice->devmem_capacity = new_capacity;
mdevice->devmem_chunks = new_chunks;
}
-
ptr = memremap_pages(&devmem->pagemap, numa_node_id());
- if (IS_ERR(ptr))
+ if (IS_ERR_OR_NULL(ptr)) {
+ if (ptr)
+ ret = PTR_ERR(ptr);
+ else
+ ret = -EFAULT;
goto err_release;
+ }
devmem->mdevice = mdevice;
pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
@@ -525,30 +582,35 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
}
spin_unlock(&mdevice->lock);
- return true;
+ return 0;
err_release:
mutex_unlock(&mdevice->devmem_lock);
- release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
+ if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
+ release_mem_region(devmem->pagemap.range.start,
+ range_len(&devmem->pagemap.range));
err_devmem:
kfree(devmem);
- return false;
+ return ret;
}
static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
struct page *dpage = NULL;
- struct page *rpage;
+ struct page *rpage = NULL;
/*
- * This is a fake device so we alloc real system memory to store
- * our device memory.
+ * For ZONE_DEVICE private type, this is a fake device so we alloc real
+ * system memory to store our device memory.
+ * For ZONE_DEVICE coherent type we use the actual dpage to store the data
+ * and ignore rpage.
*/
- rpage = alloc_page(GFP_HIGHUSER);
- if (!rpage)
- return NULL;
-
+ if (dmirror_is_private_zone(mdevice)) {
+ rpage = alloc_page(GFP_HIGHUSER);
+ if (!rpage)
+ return NULL;
+ }
spin_lock(&mdevice->lock);
if (mdevice->free_pages) {
@@ -558,7 +620,7 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
spin_unlock(&mdevice->lock);
} else {
spin_unlock(&mdevice->lock);
- if (!dmirror_allocate_chunk(mdevice, &dpage))
+ if (dmirror_allocate_chunk(mdevice, &dpage))
goto error;
}
@@ -568,7 +630,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
return dpage;
error:
- __free_page(rpage);
+ if (rpage)
+ __free_page(rpage);
return NULL;
}
@@ -594,12 +657,16 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
* unallocated pte_none() or read-only zero page.
*/
spage = migrate_pfn_to_page(*src);
+ if (WARN(spage && is_zone_device_page(spage),
+ "page already in device spage pfn: 0x%lx\n",
+ page_to_pfn(spage)))
+ continue;
dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage)
continue;
- rpage = dpage->zone_device_data;
+ rpage = BACKING_PAGE(dpage);
if (spage)
copy_highpage(rpage, spage);
else
@@ -613,6 +680,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
*/
rpage->zone_device_data = dmirror;
+ pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
+ page_to_pfn(spage), page_to_pfn(dpage));
*dst = migrate_pfn(page_to_pfn(dpage));
if ((*src & MIGRATE_PFN_WRITE) ||
(!spage && args->vma->vm_flags & VM_WRITE))
@@ -690,11 +759,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
if (!dpage)
continue;
- /*
- * Store the page that holds the data so the page table
- * doesn't have to deal with ZONE_DEVICE private pages.
- */
- entry = dpage->zone_device_data;
+ entry = BACKING_PAGE(dpage);
if (*dst & MIGRATE_PFN_WRITE)
entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
@@ -774,15 +839,124 @@ static int dmirror_exclusive(struct dmirror *dmirror,
return ret;
}
-static int dmirror_migrate(struct dmirror *dmirror,
- struct hmm_dmirror_cmd *cmd)
+static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
+ struct dmirror *dmirror)
+{
+ const unsigned long *src = args->src;
+ unsigned long *dst = args->dst;
+ unsigned long start = args->start;
+ unsigned long end = args->end;
+ unsigned long addr;
+
+ for (addr = start; addr < end; addr += PAGE_SIZE,
+ src++, dst++) {
+ struct page *dpage, *spage;
+
+ spage = migrate_pfn_to_page(*src);
+ if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ if (WARN_ON(!is_dev_private_or_coherent_page(spage)))
+ continue;
+ spage = BACKING_PAGE(spage);
+ dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
+ if (!dpage)
+ continue;
+ pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
+ page_to_pfn(spage), page_to_pfn(dpage));
+
+ lock_page(dpage);
+ xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
+ copy_highpage(dpage, spage);
+ *dst = migrate_pfn(page_to_pfn(dpage));
+ if (*src & MIGRATE_PFN_WRITE)
+ *dst |= MIGRATE_PFN_WRITE;
+ }
+ return 0;
+}
+
+static unsigned long dmirror_successful_migrated_pages(struct migrate_vma *migrate)
+{
+ unsigned long cpages = 0;
+ unsigned long i;
+
+ for (i = 0; i < migrate->npages; i++) {
+ if (migrate->src[i] & MIGRATE_PFN_VALID &&
+ migrate->src[i] & MIGRATE_PFN_MIGRATE)
+ cpages++;
+ }
+ return cpages;
+}
+
+static int dmirror_migrate_to_system(struct dmirror *dmirror,
+ struct hmm_dmirror_cmd *cmd)
+{
+ unsigned long start, end, addr;
+ unsigned long size = cmd->npages << PAGE_SHIFT;
+ struct mm_struct *mm = dmirror->notifier.mm;
+ struct vm_area_struct *vma;
+ unsigned long src_pfns[64] = { 0 };
+ unsigned long dst_pfns[64] = { 0 };
+ struct migrate_vma args;
+ unsigned long next;
+ int ret;
+
+ start = cmd->addr;
+ end = start + size;
+ if (end < start)
+ return -EINVAL;
+
+ /* Since the mm is for the mirrored process, get a reference first. */
+ if (!mmget_not_zero(mm))
+ return -EINVAL;
+
+ cmd->cpages = 0;
+ mmap_read_lock(mm);
+ for (addr = start; addr < end; addr = next) {
+ vma = vma_lookup(mm, addr);
+ if (!vma || !(vma->vm_flags & VM_READ)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
+ if (next > vma->vm_end)
+ next = vma->vm_end;
+
+ args.vma = vma;
+ args.src = src_pfns;
+ args.dst = dst_pfns;
+ args.start = addr;
+ args.end = next;
+ args.pgmap_owner = dmirror->mdevice;
+ args.flags = dmirror_select_device(dmirror);
+
+ ret = migrate_vma_setup(&args);
+ if (ret)
+ goto out;
+
+ pr_debug("Migrating from device mem to sys mem\n");
+ dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
+
+ migrate_vma_pages(&args);
+ cmd->cpages += dmirror_successful_migrated_pages(&args);
+ migrate_vma_finalize(&args);
+ }
+out:
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ return ret;
+}
+
+static int dmirror_migrate_to_device(struct dmirror *dmirror,
+ struct hmm_dmirror_cmd *cmd)
{
unsigned long start, end, addr;
unsigned long size = cmd->npages << PAGE_SHIFT;
struct mm_struct *mm = dmirror->notifier.mm;
struct vm_area_struct *vma;
- unsigned long src_pfns[64];
- unsigned long dst_pfns[64];
+ unsigned long src_pfns[64] = { 0 };
+ unsigned long dst_pfns[64] = { 0 };
struct dmirror_bounce bounce;
struct migrate_vma args;
unsigned long next;
@@ -819,6 +993,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
if (ret)
goto out;
+ pr_debug("Migrating from sys mem to device mem\n");
dmirror_migrate_alloc_and_copy(&args, dmirror);
migrate_vma_pages(&args);
dmirror_migrate_finalize_and_map(&args, dmirror);
@@ -827,7 +1002,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
mmap_read_unlock(mm);
mmput(mm);
- /* Return the migrated data for verification. */
+ /* Return the migrated data for verification. only for pages in device zone */
ret = dmirror_bounce_init(&bounce, start, size);
if (ret)
return ret;
@@ -864,12 +1039,22 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
}
page = hmm_pfn_to_page(entry);
- if (is_device_private_page(page)) {
- /* Is the page migrated to this device or some other? */
- if (dmirror->mdevice == dmirror_page_to_device(page))
+ if (is_dev_private_or_coherent_page(page)) {
+ /* Is page ZONE_DEVICE coherent? */
+ if (is_device_coherent_page(page)) {
+ if (dmirror->mdevice == dmirror_page_to_device(page))
+ *perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL;
+ else
+ *perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE;
+ /*
+ * Is page ZONE_DEVICE private migrated to
+ * this device or some other?
+ */
+ } else if (dmirror->mdevice == dmirror_page_to_device(page)) {
*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
- else
+ } else {
*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
+ }
} else if (is_zero_pfn(page_to_pfn(page)))
*perm = HMM_DMIRROR_PROT_ZERO;
else
@@ -1024,6 +1209,15 @@ static int dmirror_snapshot(struct dmirror *dmirror,
return ret;
}
+static int dmirror_get_device_type(struct dmirror *dmirror,
+ struct hmm_dmirror_cmd *cmd)
+{
+ mutex_lock(&dmirror->mutex);
+ cmd->zone_device_type = dmirror->mdevice->zone_device_type;
+ mutex_unlock(&dmirror->mutex);
+
+ return 0;
+}
static long dmirror_fops_unlocked_ioctl(struct file *filp,
unsigned int command,
unsigned long arg)
@@ -1057,8 +1251,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
ret = dmirror_write(dmirror, &cmd);
break;
- case HMM_DMIRROR_MIGRATE:
- ret = dmirror_migrate(dmirror, &cmd);
+ case HMM_DMIRROR_MIGRATE_TO_DEV:
+ ret = dmirror_migrate_to_device(dmirror, &cmd);
+ break;
+
+ case HMM_DMIRROR_MIGRATE_TO_SYS:
+ ret = dmirror_migrate_to_system(dmirror, &cmd);
break;
case HMM_DMIRROR_EXCLUSIVE:
@@ -1074,6 +1272,9 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
ret = dmirror_snapshot(dmirror, &cmd);
break;
+ case HMM_DMIRROR_GET_MEM_DEV_TYPE:
+ ret = dmirror_get_device_type(dmirror, &cmd);
+ break;
default:
return -EINVAL;
}
@@ -1120,14 +1321,13 @@ static const struct file_operations dmirror_fops = {
static void dmirror_devmem_free(struct page *page)
{
- struct page *rpage = page->zone_device_data;
+ struct page *rpage = BACKING_PAGE(page);
struct dmirror_device *mdevice;
- if (rpage)
+ if (rpage != page)
__free_page(rpage);
mdevice = dmirror_page_to_device(page);
-
spin_lock(&mdevice->lock);
mdevice->cfree++;
page->zone_device_data = mdevice->free_pages;
@@ -1135,43 +1335,11 @@ static void dmirror_devmem_free(struct page *page)
spin_unlock(&mdevice->lock);
}
-static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
- struct dmirror *dmirror)
-{
- const unsigned long *src = args->src;
- unsigned long *dst = args->dst;
- unsigned long start = args->start;
- unsigned long end = args->end;
- unsigned long addr;
-
- for (addr = start; addr < end; addr += PAGE_SIZE,
- src++, dst++) {
- struct page *dpage, *spage;
-
- spage = migrate_pfn_to_page(*src);
- if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
- continue;
- spage = spage->zone_device_data;
-
- dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
- if (!dpage)
- continue;
-
- lock_page(dpage);
- xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
- copy_highpage(dpage, spage);
- *dst = migrate_pfn(page_to_pfn(dpage));
- if (*src & MIGRATE_PFN_WRITE)
- *dst |= MIGRATE_PFN_WRITE;
- }
- return 0;
-}
-
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
struct migrate_vma args;
- unsigned long src_pfns;
- unsigned long dst_pfns;
+ unsigned long src_pfns = 0;
+ unsigned long dst_pfns = 0;
struct page *rpage;
struct dmirror *dmirror;
vm_fault_t ret;
@@ -1191,7 +1359,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
args.src = &src_pfns;
args.dst = &dst_pfns;
args.pgmap_owner = dmirror->mdevice;
- args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
+ args.flags = dmirror_select_device(dmirror);
if (migrate_vma_setup(&args))
return VM_FAULT_SIGBUS;
@@ -1229,10 +1397,8 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id)
if (ret)
return ret;
- /* Build a list of free ZONE_DEVICE private struct pages */
- dmirror_allocate_chunk(mdevice, NULL);
-
- return 0;
+ /* Build a list of free ZONE_DEVICE struct pages */
+ return dmirror_allocate_chunk(mdevice, NULL);
}
static void dmirror_device_remove(struct dmirror_device *mdevice)
@@ -1245,8 +1411,9 @@ static void dmirror_device_remove(struct dmirror_device *mdevice)
mdevice->devmem_chunks[i];
memunmap_pages(&devmem->pagemap);
- release_mem_region(devmem->pagemap.range.start,
- range_len(&devmem->pagemap.range));
+ if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
+ release_mem_region(devmem->pagemap.range.start,
+ range_len(&devmem->pagemap.range));
kfree(devmem);
}
kfree(mdevice->devmem_chunks);
@@ -1258,14 +1425,26 @@ static void dmirror_device_remove(struct dmirror_device *mdevice)
static int __init hmm_dmirror_init(void)
{
int ret;
- int id;
+ int id = 0;
+ int ndevices = 0;
ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
"HMM_DMIRROR");
if (ret)
goto err_unreg;
- for (id = 0; id < DMIRROR_NDEVICES; id++) {
+ memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0]));
+ dmirror_devices[ndevices++].zone_device_type =
+ HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+ dmirror_devices[ndevices++].zone_device_type =
+ HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+ if (spm_addr_dev0 && spm_addr_dev1) {
+ dmirror_devices[ndevices++].zone_device_type =
+ HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
+ dmirror_devices[ndevices++].zone_device_type =
+ HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
+ }
+ for (id = 0; id < ndevices; id++) {
ret = dmirror_device_init(dmirror_devices + id, id);
if (ret)
goto err_chrdev;
@@ -1287,7 +1466,8 @@ static void __exit hmm_dmirror_exit(void)
int id;
for (id = 0; id < DMIRROR_NDEVICES; id++)
- dmirror_device_remove(dmirror_devices + id);
+ if (dmirror_devices[id].zone_device_type)
+ dmirror_device_remove(dmirror_devices + id);
unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index f14dea5dcd062..e190b2ab6f199 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -19,6 +19,7 @@
* @npages: (in) number of pages to read/write
* @cpages: (out) number of pages copied
* @faults: (out) number of device page faults seen
+ * @zone_device_type: (out) zone device memory type
*/
struct hmm_dmirror_cmd {
__u64 addr;
@@ -26,15 +27,18 @@ struct hmm_dmirror_cmd {
__u64 npages;
__u64 cpages;
__u64 faults;
+ __u64 zone_device_type;
};
/* Expose the address space of the calling process through hmm device file */
#define HMM_DMIRROR_READ _IOWR('H', 0x00, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x07, struct hmm_dmirror_cmd)
/*
* Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
@@ -49,6 +53,8 @@ struct hmm_dmirror_cmd {
* device the ioctl() is made
* HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some
* other device
+ * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device
+ * the ioctl() is made
*/
enum {
HMM_DMIRROR_PROT_ERROR = 0xFF,
@@ -60,6 +66,14 @@ enum {
HMM_DMIRROR_PROT_ZERO = 0x10,
HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20,
HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
+ HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL = 0x40,
+ HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE = 0x50,
+};
+
+enum {
+ /* 0 is reserved to catch uninitialized type fields */
+ HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1,
+ HMM_DMIRROR_MEMORY_DEVICE_COHERENT,
};
#endif /* _LIB_TEST_HMM_UAPI_H */
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 26a5c9007653a..4311cf5319557 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -19,6 +19,7 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
#include <asm/page.h>
@@ -869,11 +870,14 @@ static void kmem_cache_invalid_free(struct kunit *test)
kmem_cache_destroy(cache);
}
+static void empty_cache_ctor(void *object) { }
+
static void kmem_cache_double_destroy(struct kunit *test)
{
struct kmem_cache *cache;
- cache = kmem_cache_create("test_cache", 200, 0, 0, NULL);
+ /* Provide a constructor to prevent cache merging. */
+ cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
kmem_cache_destroy(cache);
KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache));
@@ -1054,21 +1058,183 @@ static void kmalloc_double_kzfree(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr));
}
+static void vmalloc_helpers_tags(struct kunit *test)
+{
+ void *ptr;
+ int rv;
+
+ /* This test is intended for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ ptr = vmalloc(PAGE_SIZE);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ /* Check that the returned pointer is tagged. */
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure exported vmalloc helpers handle tagged pointers. */
+ KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr));
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr));
+
+ /* Make sure vmalloc'ed memory permissions can be changed. */
+ rv = set_memory_ro((unsigned long)ptr, 1);
+ KUNIT_ASSERT_GE(test, rv, 0);
+ rv = set_memory_rw((unsigned long)ptr, 1);
+ KUNIT_ASSERT_GE(test, rv, 0);
+
+ vfree(ptr);
+}
+
static void vmalloc_oob(struct kunit *test)
{
- void *area;
+ char *v_ptr, *p_ptr;
+ struct page *page;
+ size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5;
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+ v_ptr = vmalloc(size);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ OPTIMIZER_HIDE_VAR(v_ptr);
+
/*
- * We have to be careful not to hit the guard page.
+ * We have to be careful not to hit the guard page in vmalloc tests.
* The MMU will catch that and crash us.
*/
- area = vmalloc(3000);
- KUNIT_ASSERT_NOT_ERR_OR_NULL(test, area);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]);
- vfree(area);
+ /* Make sure in-bounds accesses are valid. */
+ v_ptr[0] = 0;
+ v_ptr[size - 1] = 0;
+
+ /*
+ * An unaligned access past the requested vmalloc size.
+ * Only generic KASAN can precisely detect these.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]);
+
+ /* An aligned access into the first out-of-bounds granule. */
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]);
+
+ /* Check that in-bounds accesses to the physical page are valid. */
+ page = vmalloc_to_page(v_ptr);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page);
+ p_ptr = page_address(page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+ p_ptr[0] = 0;
+
+ vfree(v_ptr);
+
+ /*
+ * We can't check for use-after-unmap bugs in this nor in the following
+ * vmalloc tests, as the page might be fully unmapped and accessing it
+ * will crash the kernel.
+ */
+}
+
+static void vmap_tags(struct kunit *test)
+{
+ char *p_ptr, *v_ptr;
+ struct page *p_page, *v_page;
+ size_t order = 1;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons vmap mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ p_page = alloc_pages(GFP_KERNEL, order);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page);
+ p_ptr = page_address(p_page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+
+ v_ptr = vmap(&p_page, 1 << order, VM_MAP, PAGE_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ /*
+ * We can't check for out-of-bounds bugs in this nor in the following
+ * vmalloc tests, as allocations have page granularity and accessing
+ * the guard page will crash the kernel.
+ */
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses through both pointers work. */
+ *p_ptr = 0;
+ *v_ptr = 0;
+
+ /* Make sure vmalloc_to_page() correctly recovers the page pointer. */
+ v_page = vmalloc_to_page(v_ptr);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page);
+ KUNIT_EXPECT_PTR_EQ(test, p_page, v_page);
+
+ vunmap(v_ptr);
+ free_pages((unsigned long)p_ptr, order);
+}
+
+static void vm_map_ram_tags(struct kunit *test)
+{
+ char *p_ptr, *v_ptr;
+ struct page *page;
+ size_t order = 1;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons vm_map_ram mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ page = alloc_pages(GFP_KERNEL, order);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page);
+ p_ptr = page_address(page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+
+ v_ptr = vm_map_ram(&page, 1 << order, -1);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses through both pointers work. */
+ *p_ptr = 0;
+ *v_ptr = 0;
+
+ vm_unmap_ram(v_ptr, 1 << order);
+ free_pages((unsigned long)p_ptr, order);
+}
+
+static void vmalloc_percpu(struct kunit *test)
+{
+ char __percpu *ptr;
+ int cpu;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons percpu mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+
+ for_each_possible_cpu(cpu) {
+ char *c_ptr = per_cpu_ptr(ptr, cpu);
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses don't crash the kernel. */
+ *c_ptr = 0;
+ }
+
+ free_percpu(ptr);
}
/*
@@ -1102,6 +1268,18 @@ static void match_all_not_assigned(struct kunit *test)
KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
free_pages((unsigned long)ptr, order);
}
+
+ if (!IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ return;
+
+ for (i = 0; i < 256; i++) {
+ size = (get_random_int() % 1024) + 1;
+ ptr = vmalloc(size);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ vfree(ptr);
+ }
}
/* Check that 0xff works as a match-all pointer tag for tag-based modes. */
@@ -1207,7 +1385,11 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kasan_bitops_generic),
KUNIT_CASE(kasan_bitops_tags),
KUNIT_CASE(kmalloc_double_kzfree),
+ KUNIT_CASE(vmalloc_helpers_tags),
KUNIT_CASE(vmalloc_oob),
+ KUNIT_CASE(vmap_tags),
+ KUNIT_CASE(vm_map_ram_tags),
+ KUNIT_CASE(vmalloc_percpu),
KUNIT_CASE(match_all_not_assigned),
KUNIT_CASE(match_all_ptr_tag),
KUNIT_CASE(match_all_mem_tag),
diff --git a/lib/test_printf.c b/lib/test_printf.c
index 07309c45f3279..8010de49b6c5d 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -673,17 +673,17 @@ flags(void)
gfp = GFP_ATOMIC|__GFP_DMA;
test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp);
- gfp = __GFP_ATOMIC;
- test("__GFP_ATOMIC", "%pGg", &gfp);
+ gfp = __GFP_HIGH;
+ test("__GFP_HIGH", "%pGg", &gfp);
/* Any flags not translated by the table should remain numeric */
gfp = ~__GFP_BITS_MASK;
snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp);
test(cmp_buffer, "%pGg", &gfp);
- snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx",
+ snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx",
(unsigned long) gfp);
- gfp |= __GFP_ATOMIC;
+ gfp |= __GFP_HIGH;
test(cmp_buffer, "%pGg", &gfp);
kfree(cmp_buffer);
diff --git a/lib/ubsan.c b/lib/ubsan.c
index bdc380ff5d5c7..36bd75e334263 100644
--- a/lib/ubsan.c
+++ b/lib/ubsan.c
@@ -154,16 +154,8 @@ static void ubsan_epilogue(void)
current->in_ubsan--;
- if (panic_on_warn) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
+ if (panic_on_warn)
panic("panic_on_warn set ...\n");
- }
}
void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs)
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index a1babe5e07d10..229cbc666e020 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -986,9 +986,7 @@ char *symbol_string(char *buf, char *end, void *ptr,
struct printf_spec spec, const char *fmt)
{
unsigned long value;
-#ifdef CONFIG_KALLSYMS
char sym[KSYM_SYMBOL_LEN];
-#endif
if (fmt[1] == 'R')
ptr = __builtin_extract_return_addr(ptr);
@@ -1008,6 +1006,9 @@ char *symbol_string(char *buf, char *end, void *ptr,
return string_nocheck(buf, end, sym, spec);
#else
+ if (fill_minimal_module_info(sym, KSYM_SYMBOL_LEN, value))
+ return string_nocheck(buf, end, sym, spec);
+
return special_hex_number(buf, end, value, sizeof(void *));
#endif
}
@@ -2905,13 +2906,15 @@ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
int i;
+ if (unlikely(!size))
+ return 0;
+
i = vsnprintf(buf, size, fmt, args);
if (likely(i < size))
return i;
- if (size != 0)
- return size - 1;
- return 0;
+
+ return size - 1;
}
EXPORT_SYMBOL(vscnprintf);
diff --git a/mm/Kconfig b/mm/Kconfig
index 3326ee3903f33..edb6ced1c8e4f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -411,6 +411,9 @@ choice
benefit.
endchoice
+config ARCH_WANT_GENERAL_HUGETLB
+ bool
+
config ARCH_WANTS_THP_SWAP
def_bool n
@@ -744,6 +747,9 @@ config IDLE_PAGE_TRACKING
config ARCH_HAS_CACHE_LINE_SIZE
bool
+config ARCH_HAS_FILTER_PGPROT
+ bool
+
config ARCH_HAS_PTE_DEVMAP
bool
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index eae96dfe0261c..7176af65b103a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1005,60 +1005,3 @@ const char *bdi_dev_name(struct backing_dev_info *bdi)
return bdi->dev_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);
-
-static wait_queue_head_t congestion_wqh[2] = {
- __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
- __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
- };
-static atomic_t nr_wb_congested[2];
-
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
-{
- wait_queue_head_t *wqh = &congestion_wqh[sync];
- enum wb_congested_state bit;
-
- bit = sync ? WB_sync_congested : WB_async_congested;
- if (test_and_clear_bit(bit, &bdi->wb.congested))
- atomic_dec(&nr_wb_congested[sync]);
- smp_mb__after_atomic();
- if (waitqueue_active(wqh))
- wake_up(wqh);
-}
-EXPORT_SYMBOL(clear_bdi_congested);
-
-void set_bdi_congested(struct backing_dev_info *bdi, int sync)
-{
- enum wb_congested_state bit;
-
- bit = sync ? WB_sync_congested : WB_async_congested;
- if (!test_and_set_bit(bit, &bdi->wb.congested))
- atomic_inc(&nr_wb_congested[sync]);
-}
-EXPORT_SYMBOL(set_bdi_congested);
-
-/**
- * congestion_wait - wait for a backing_dev to become uncongested
- * @sync: SYNC or ASYNC IO
- * @timeout: timeout in jiffies
- *
- * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
- * write congestion. If no backing_devs are congested then just wait for the
- * next write to be completed.
- */
-long congestion_wait(int sync, long timeout)
-{
- long ret;
- unsigned long start = jiffies;
- DEFINE_WAIT(wait);
- wait_queue_head_t *wqh = &congestion_wqh[sync];
-
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- ret = io_schedule_timeout(timeout);
- finish_wait(wqh, &wait);
-
- trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
- jiffies_to_usecs(jiffies - start));
-
- return ret;
-}
-EXPORT_SYMBOL(congestion_wait);
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 907fefde25728..4b8eab4b3f456 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue);
#ifdef CONFIG_BALLOON_COMPACTION
-bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
+static bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
{
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
@@ -217,7 +217,7 @@ bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
return true;
}
-void balloon_page_putback(struct page *page)
+static void balloon_page_putback(struct page *page)
{
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
@@ -230,7 +230,7 @@ void balloon_page_putback(struct page *page)
/* move_to_new_page() counterpart for a ballooned page */
-int balloon_page_migrate(struct address_space *mapping,
+static int balloon_page_migrate(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode)
{
diff --git a/mm/cma.c b/mm/cma.c
index bc9ca8f3c4871..766f1b82b5322 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -131,8 +131,10 @@ not_in_zone:
bitmap_free(cma->bitmap);
out_error:
/* Expose all pages to the buddy, they are useless for CMA. */
- for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++)
- free_reserved_page(pfn_to_page(pfn));
+ if (!cma->reserve_pages_on_error) {
+ for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++)
+ free_reserved_page(pfn_to_page(pfn));
+ }
totalcma_pages -= cma->count;
cma->count = 0;
pr_err("CMA area %s could not be activated\n", cma->name);
@@ -150,6 +152,11 @@ static int __init cma_init_reserved_areas(void)
}
core_initcall(cma_init_reserved_areas);
+void __init cma_reserve_pages_on_error(struct cma *cma)
+{
+ cma->reserve_pages_on_error = true;
+}
+
/**
* cma_init_reserved_mem() - create custom contiguous area from reserved memory
* @base: Base address of the reserved area
diff --git a/mm/cma.h b/mm/cma.h
index 2c775877eae24..88a0595670b76 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -30,6 +30,7 @@ struct cma {
/* kobject requires dynamic object */
struct cma_kobject *cma_kobj;
#endif
+ bool reserve_pages_on_error;
};
extern struct cma cma_areas[MAX_CMA_AREAS];
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index 7008c3735e99f..b4085deb9fa05 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -24,7 +24,7 @@ static void damon_test_regions(struct kunit *test)
KUNIT_EXPECT_EQ(test, 2ul, r->ar.end);
KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses);
- t = damon_new_target(42);
+ t = damon_new_target();
KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t));
damon_add_region(r, t);
@@ -52,8 +52,7 @@ static void damon_test_target(struct kunit *test)
struct damon_ctx *c = damon_new_ctx();
struct damon_target *t;
- t = damon_new_target(42);
- KUNIT_EXPECT_EQ(test, 42ul, t->id);
+ t = damon_new_target();
KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c));
damon_add_target(c, t);
@@ -78,7 +77,6 @@ static void damon_test_target(struct kunit *test)
static void damon_test_aggregate(struct kunit *test)
{
struct damon_ctx *ctx = damon_new_ctx();
- unsigned long target_ids[] = {1, 2, 3};
unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} };
unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} };
unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} };
@@ -86,7 +84,10 @@ static void damon_test_aggregate(struct kunit *test)
struct damon_region *r;
int it, ir;
- damon_set_targets(ctx, target_ids, 3);
+ for (it = 0; it < 3; it++) {
+ t = damon_new_target();
+ damon_add_target(ctx, t);
+ }
it = 0;
damon_for_each_target(t, ctx) {
@@ -122,7 +123,7 @@ static void damon_test_split_at(struct kunit *test)
struct damon_target *t;
struct damon_region *r;
- t = damon_new_target(42);
+ t = damon_new_target();
r = damon_new_region(0, 100);
damon_add_region(r, t);
damon_split_region_at(c, t, r, 25);
@@ -143,7 +144,7 @@ static void damon_test_merge_two(struct kunit *test)
struct damon_region *r, *r2, *r3;
int i;
- t = damon_new_target(42);
+ t = damon_new_target();
r = damon_new_region(0, 100);
r->nr_accesses = 10;
damon_add_region(r, t);
@@ -191,7 +192,7 @@ static void damon_test_merge_regions_of(struct kunit *test)
unsigned long eaddrs[] = {112, 130, 156, 170, 230};
int i;
- t = damon_new_target(42);
+ t = damon_new_target();
for (i = 0; i < ARRAY_SIZE(sa); i++) {
r = damon_new_region(sa[i], ea[i]);
r->nr_accesses = nrs[i];
@@ -215,14 +216,14 @@ static void damon_test_split_regions_of(struct kunit *test)
struct damon_target *t;
struct damon_region *r;
- t = damon_new_target(42);
+ t = damon_new_target();
r = damon_new_region(0, 22);
damon_add_region(r, t);
damon_split_regions_of(c, t, 2);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
- t = damon_new_target(42);
+ t = damon_new_target();
r = damon_new_region(0, 220);
damon_add_region(r, t);
damon_split_regions_of(c, t, 4);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 1dd153c31c9e2..bf495236d741b 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -144,7 +144,7 @@ void damon_destroy_scheme(struct damos *s)
*
* Returns the pointer to the new struct if success, or NULL otherwise
*/
-struct damon_target *damon_new_target(unsigned long id)
+struct damon_target *damon_new_target(void)
{
struct damon_target *t;
@@ -152,7 +152,7 @@ struct damon_target *damon_new_target(unsigned long id)
if (!t)
return NULL;
- t->id = id;
+ t->pid = NULL;
t->nr_regions = 0;
INIT_LIST_HEAD(&t->regions_list);
@@ -246,38 +246,6 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
}
/**
- * damon_set_targets() - Set monitoring targets.
- * @ctx: monitoring context
- * @ids: array of target ids
- * @nr_ids: number of entries in @ids
- *
- * This function should not be called while the kdamond is running.
- *
- * Return: 0 on success, negative error code otherwise.
- */
-int damon_set_targets(struct damon_ctx *ctx,
- unsigned long *ids, ssize_t nr_ids)
-{
- ssize_t i;
- struct damon_target *t, *next;
-
- damon_destroy_targets(ctx);
-
- for (i = 0; i < nr_ids; i++) {
- t = damon_new_target(ids[i]);
- if (!t) {
- /* The caller should do cleanup of the ids itself */
- damon_for_each_target_safe(t, next, ctx)
- damon_destroy_target(t);
- return -ENOMEM;
- }
- damon_add_target(ctx, t);
- }
-
- return 0;
-}
-
-/**
* damon_set_attrs() - Set attributes for the monitoring.
* @ctx: monitoring context
* @sample_int: time interval between samplings
diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h
index 86b9f9528231e..0d3a14c00acfb 100644
--- a/mm/damon/dbgfs-test.h
+++ b/mm/damon/dbgfs-test.h
@@ -12,66 +12,58 @@
#include <kunit/test.h>
-static void damon_dbgfs_test_str_to_target_ids(struct kunit *test)
+static void damon_dbgfs_test_str_to_ints(struct kunit *test)
{
char *question;
- unsigned long *answers;
- unsigned long expected[] = {12, 35, 46};
+ int *answers;
+ int expected[] = {12, 35, 46};
ssize_t nr_integers = 0, i;
question = "123";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
- KUNIT_EXPECT_EQ(test, 123ul, answers[0]);
+ KUNIT_EXPECT_EQ(test, 123, answers[0]);
kfree(answers);
question = "123abc";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
- KUNIT_EXPECT_EQ(test, 123ul, answers[0]);
+ KUNIT_EXPECT_EQ(test, 123, answers[0]);
kfree(answers);
question = "a123";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
kfree(answers);
question = "12 35";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
for (i = 0; i < nr_integers; i++)
KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
kfree(answers);
question = "12 35 46";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers);
for (i = 0; i < nr_integers; i++)
KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
kfree(answers);
question = "12 35 abc 46";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
for (i = 0; i < 2; i++)
KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
kfree(answers);
question = "";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
kfree(answers);
question = "\n";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
kfree(answers);
}
@@ -79,30 +71,20 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test)
static void damon_dbgfs_test_set_targets(struct kunit *test)
{
struct damon_ctx *ctx = dbgfs_new_ctx();
- unsigned long ids[] = {1, 2, 3};
char buf[64];
- /* Make DAMON consider target id as plain number */
- ctx->primitive.target_valid = NULL;
- ctx->primitive.cleanup = NULL;
+ /* Make DAMON consider target has no pid */
+ ctx->primitive = (struct damon_primitive){};
- damon_set_targets(ctx, ids, 3);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n");
-
- damon_set_targets(ctx, NULL, 0);
+ dbgfs_set_targets(ctx, 0, NULL);
sprint_target_ids(ctx, buf, 64);
KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
- damon_set_targets(ctx, (unsigned long []){1, 2}, 2);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n");
-
- damon_set_targets(ctx, (unsigned long []){2}, 1);
+ dbgfs_set_targets(ctx, 1, NULL);
sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n");
+ KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n");
- damon_set_targets(ctx, NULL, 0);
+ dbgfs_set_targets(ctx, 0, NULL);
sprint_target_ids(ctx, buf, 64);
KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
@@ -112,25 +94,24 @@ static void damon_dbgfs_test_set_targets(struct kunit *test)
static void damon_dbgfs_test_set_init_regions(struct kunit *test)
{
struct damon_ctx *ctx = damon_new_ctx();
- unsigned long ids[] = {1, 2, 3};
- /* Each line represents one region in ``<target id> <start> <end>`` */
- char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45",
- "2 10 20\n",
- "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n",
+ /* Each line represents one region in ``<target idx> <start> <end>`` */
+ char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45",
+ "1 10 20\n",
+ "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n",
""};
/* Reading the file again will show sorted, clean output */
- char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n",
- "2 10 20\n",
- "1 39 59\n1 70 134\n2 10 20\n2 20 25\n",
+ char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n",
+ "1 10 20\n",
+ "0 39 59\n0 70 134\n1 10 20\n1 20 25\n",
""};
- char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */
- "2 10 20\n 2 14 26\n", /* regions overlap */
- "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */
+ char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */
+ "1 10 20\n 1 14 26\n", /* regions overlap */
+ "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */
char *input, *expect;
int i, rc;
char buf[256];
- damon_set_targets(ctx, ids, 3);
+ dbgfs_set_targets(ctx, 3, NULL);
/* Put valid inputs and check the results */
for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) {
@@ -158,12 +139,12 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test)
KUNIT_EXPECT_STREQ(test, (char *)buf, "");
}
- damon_set_targets(ctx, NULL, 0);
+ dbgfs_set_targets(ctx, 0, NULL);
damon_destroy_ctx(ctx);
}
static struct kunit_case damon_test_cases[] = {
- KUNIT_CASE(damon_dbgfs_test_str_to_target_ids),
+ KUNIT_CASE(damon_dbgfs_test_str_to_ints),
KUNIT_CASE(damon_dbgfs_test_set_targets),
KUNIT_CASE(damon_dbgfs_test_set_init_regions),
{},
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 5b899601e56c3..78ff645433c64 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -275,7 +275,7 @@ out:
return ret;
}
-static inline bool targetid_is_pid(const struct damon_ctx *ctx)
+static inline bool target_has_pid(const struct damon_ctx *ctx)
{
return ctx->primitive.target_valid == damon_va_target_valid;
}
@@ -283,17 +283,19 @@ static inline bool targetid_is_pid(const struct damon_ctx *ctx)
static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
{
struct damon_target *t;
- unsigned long id;
+ int id;
int written = 0;
int rc;
damon_for_each_target(t, ctx) {
- id = t->id;
- if (targetid_is_pid(ctx))
+ if (target_has_pid(ctx))
/* Show pid numbers to debugfs users */
- id = (unsigned long)pid_vnr((struct pid *)id);
+ id = pid_vnr(t->pid);
+ else
+ /* Show 42 for physical address space, just for fun */
+ id = 42;
- rc = scnprintf(&buf[written], len - written, "%lu ", id);
+ rc = scnprintf(&buf[written], len - written, "%d ", id);
if (!rc)
return -ENOMEM;
written += rc;
@@ -321,54 +323,129 @@ static ssize_t dbgfs_target_ids_read(struct file *file,
}
/*
- * Converts a string into an array of unsigned long integers
+ * Converts a string into an integers array
*
- * Returns an array of unsigned long integers if the conversion success, or
- * NULL otherwise.
+ * Returns an array of integers array if the conversion success, or NULL
+ * otherwise.
*/
-static unsigned long *str_to_target_ids(const char *str, ssize_t len,
- ssize_t *nr_ids)
+static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints)
{
- unsigned long *ids;
- const int max_nr_ids = 32;
- unsigned long id;
+ int *array;
+ const int max_nr_ints = 32;
+ int nr;
int pos = 0, parsed, ret;
- *nr_ids = 0;
- ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL);
- if (!ids)
+ *nr_ints = 0;
+ array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL);
+ if (!array)
return NULL;
- while (*nr_ids < max_nr_ids && pos < len) {
- ret = sscanf(&str[pos], "%lu%n", &id, &parsed);
+ while (*nr_ints < max_nr_ints && pos < len) {
+ ret = sscanf(&str[pos], "%d%n", &nr, &parsed);
pos += parsed;
if (ret != 1)
break;
- ids[*nr_ids] = id;
- *nr_ids += 1;
+ array[*nr_ints] = nr;
+ *nr_ints += 1;
}
- return ids;
+ return array;
}
-static void dbgfs_put_pids(unsigned long *ids, int nr_ids)
+static void dbgfs_put_pids(struct pid **pids, int nr_pids)
{
int i;
- for (i = 0; i < nr_ids; i++)
- put_pid((struct pid *)ids[i]);
+ for (i = 0; i < nr_pids; i++)
+ put_pid(pids[i]);
+}
+
+/*
+ * Converts a string into an struct pid pointers array
+ *
+ * Returns an array of struct pid pointers if the conversion success, or NULL
+ * otherwise.
+ */
+static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids)
+{
+ int *ints;
+ ssize_t nr_ints;
+ struct pid **pids;
+
+ *nr_pids = 0;
+
+ ints = str_to_ints(str, len, &nr_ints);
+ if (!ints)
+ return NULL;
+
+ pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL);
+ if (!pids)
+ goto out;
+
+ for (; *nr_pids < nr_ints; (*nr_pids)++) {
+ pids[*nr_pids] = find_get_pid(ints[*nr_pids]);
+ if (!pids[*nr_pids]) {
+ dbgfs_put_pids(pids, *nr_pids);
+ kfree(ints);
+ kfree(pids);
+ return NULL;
+ }
+ }
+
+out:
+ kfree(ints);
+ return pids;
+}
+
+/*
+ * dbgfs_set_targets() - Set monitoring targets.
+ * @ctx: monitoring context
+ * @nr_targets: number of targets
+ * @pids: array of target pids (size is same to @nr_targets)
+ *
+ * This function should not be called while the kdamond is running. @pids is
+ * ignored if the context is not configured to have pid in each target. On
+ * failure, reference counts of all pids in @pids are decremented.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
+ struct pid **pids)
+{
+ ssize_t i;
+ struct damon_target *t, *next;
+
+ damon_for_each_target_safe(t, next, ctx) {
+ if (target_has_pid(ctx))
+ put_pid(t->pid);
+ damon_destroy_target(t);
+ }
+
+ for (i = 0; i < nr_targets; i++) {
+ t = damon_new_target();
+ if (!t) {
+ damon_for_each_target_safe(t, next, ctx)
+ damon_destroy_target(t);
+ if (target_has_pid(ctx))
+ dbgfs_put_pids(pids, nr_targets);
+ return -ENOMEM;
+ }
+ if (target_has_pid(ctx))
+ t->pid = pids[i];
+ damon_add_target(ctx, t);
+ }
+
+ return 0;
}
static ssize_t dbgfs_target_ids_write(struct file *file,
const char __user *buf, size_t count, loff_t *ppos)
{
struct damon_ctx *ctx = file->private_data;
- struct damon_target *t, *next_t;
bool id_is_pid = true;
char *kbuf;
- unsigned long *targets;
+ struct pid **target_pids = NULL;
ssize_t nr_targets;
ssize_t ret;
- int i;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
@@ -376,42 +453,27 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
if (!strncmp(kbuf, "paddr\n", count)) {
id_is_pid = false;
- /* target id is meaningless here, but we set it just for fun */
- scnprintf(kbuf, count, "42 ");
- }
-
- targets = str_to_target_ids(kbuf, count, &nr_targets);
- if (!targets) {
- ret = -ENOMEM;
- goto out;
+ nr_targets = 1;
}
if (id_is_pid) {
- for (i = 0; i < nr_targets; i++) {
- targets[i] = (unsigned long)find_get_pid(
- (int)targets[i]);
- if (!targets[i]) {
- dbgfs_put_pids(targets, i);
- ret = -EINVAL;
- goto free_targets_out;
- }
+ target_pids = str_to_pids(kbuf, count, &nr_targets);
+ if (!target_pids) {
+ ret = -ENOMEM;
+ goto out;
}
}
mutex_lock(&ctx->kdamond_lock);
if (ctx->kdamond) {
if (id_is_pid)
- dbgfs_put_pids(targets, nr_targets);
+ dbgfs_put_pids(target_pids, nr_targets);
ret = -EBUSY;
goto unlock_out;
}
/* remove previously set targets */
- damon_for_each_target_safe(t, next_t, ctx) {
- if (targetid_is_pid(ctx))
- put_pid((struct pid *)t->id);
- damon_destroy_target(t);
- }
+ dbgfs_set_targets(ctx, 0, NULL);
/* Configure the context for the address space type */
if (id_is_pid)
@@ -419,18 +481,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
else
damon_pa_set_primitives(ctx);
- ret = damon_set_targets(ctx, targets, nr_targets);
- if (ret) {
- if (id_is_pid)
- dbgfs_put_pids(targets, nr_targets);
- } else {
+ ret = dbgfs_set_targets(ctx, nr_targets, target_pids);
+ if (!ret)
ret = count;
- }
unlock_out:
mutex_unlock(&ctx->kdamond_lock);
-free_targets_out:
- kfree(targets);
+ kfree(target_pids);
out:
kfree(kbuf);
return ret;
@@ -440,18 +497,20 @@ static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len)
{
struct damon_target *t;
struct damon_region *r;
+ int target_idx = 0;
int written = 0;
int rc;
damon_for_each_target(t, c) {
damon_for_each_region(r, t) {
rc = scnprintf(&buf[written], len - written,
- "%lu %lu %lu\n",
- t->id, r->ar.start, r->ar.end);
+ "%d %lu %lu\n",
+ target_idx, r->ar.start, r->ar.end);
if (!rc)
return -ENOMEM;
written += rc;
}
+ target_idx++;
}
return written;
}
@@ -485,22 +544,19 @@ out:
return len;
}
-static int add_init_region(struct damon_ctx *c,
- unsigned long target_id, struct damon_addr_range *ar)
+static int add_init_region(struct damon_ctx *c, int target_idx,
+ struct damon_addr_range *ar)
{
struct damon_target *t;
struct damon_region *r, *prev;
- unsigned long id;
+ unsigned long idx = 0;
int rc = -EINVAL;
if (ar->start >= ar->end)
return -EINVAL;
damon_for_each_target(t, c) {
- id = t->id;
- if (targetid_is_pid(c))
- id = (unsigned long)pid_vnr((struct pid *)id);
- if (id == target_id) {
+ if (idx++ == target_idx) {
r = damon_new_region(ar->start, ar->end);
if (!r)
return -ENOMEM;
@@ -523,7 +579,7 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
struct damon_target *t;
struct damon_region *r, *next;
int pos = 0, parsed, ret;
- unsigned long target_id;
+ int target_idx;
struct damon_addr_range ar;
int err;
@@ -533,11 +589,11 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
}
while (pos < len) {
- ret = sscanf(&str[pos], "%lu %lu %lu%n",
- &target_id, &ar.start, &ar.end, &parsed);
+ ret = sscanf(&str[pos], "%d %lu %lu%n",
+ &target_idx, &ar.start, &ar.end, &parsed);
if (ret != 3)
break;
- err = add_init_region(c, target_id, &ar);
+ err = add_init_region(c, target_idx, &ar);
if (err)
goto fail;
pos += parsed;
@@ -660,12 +716,12 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
- if (!targetid_is_pid(ctx))
+ if (!target_has_pid(ctx))
return;
mutex_lock(&ctx->kdamond_lock);
damon_for_each_target_safe(t, next, ctx) {
- put_pid((struct pid *)t->id);
+ put_pid(t->pid);
damon_destroy_target(t);
}
mutex_unlock(&ctx->kdamond_lock);
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index bc476cef688e8..29da37192e4a0 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -387,8 +387,7 @@ static int __init damon_reclaim_init(void)
damon_pa_set_primitives(ctx);
ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
- /* 4242 means nothing but fun */
- target = damon_new_target(4242);
+ target = damon_new_target();
if (!target) {
damon_destroy_ctx(ctx);
return -ENOMEM;
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index 6a1b9272ea123..f0d0ba591792c 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -139,7 +139,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
struct damon_region *r;
int i;
- t = damon_new_target(42);
+ t = damon_new_target();
for (i = 0; i < nr_regions / 2; i++) {
r = damon_new_region(regions[i * 2], regions[i * 2 + 1]);
damon_add_region(r, t);
@@ -251,7 +251,7 @@ static void damon_test_apply_three_regions4(struct kunit *test)
static void damon_test_split_evenly_fail(struct kunit *test,
unsigned long start, unsigned long end, unsigned int nr_pieces)
{
- struct damon_target *t = damon_new_target(42);
+ struct damon_target *t = damon_new_target();
struct damon_region *r = damon_new_region(start, end);
damon_add_region(r, t);
@@ -270,7 +270,7 @@ static void damon_test_split_evenly_fail(struct kunit *test,
static void damon_test_split_evenly_succ(struct kunit *test,
unsigned long start, unsigned long end, unsigned int nr_pieces)
{
- struct damon_target *t = damon_new_target(42);
+ struct damon_target *t = damon_new_target();
struct damon_region *r = damon_new_region(start, end);
unsigned long expected_width = (end - start) / nr_pieces;
unsigned long i = 0;
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 89b6468da2b9b..6d3454dd3204b 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -23,12 +23,12 @@
#endif
/*
- * 't->id' should be the pointer to the relevant 'struct pid' having reference
+ * 't->pid' should be the pointer to the relevant 'struct pid' having reference
* count. Caller must put the returned task, unless it is NULL.
*/
static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
{
- return get_pid_task((struct pid *)t->id, PIDTYPE_PID);
+ return get_pid_task(t->pid, PIDTYPE_PID);
}
/*
@@ -402,9 +402,6 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
pte_t entry = huge_ptep_get(pte);
struct page *page = pte_page(entry);
- if (!page)
- return;
-
get_page(page);
if (pte_young(entry)) {
@@ -564,9 +561,6 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
goto out;
page = pte_page(entry);
- if (!page)
- goto out;
-
get_page(page);
if (pte_young(entry) || !page_is_idle(page) ||
diff --git a/mm/debug.c b/mm/debug.c
index bc9ac87f0e08d..8b43dbb2f17be 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -265,5 +265,4 @@ void page_init_poison(struct page *page, size_t size)
if (page_init_poisoning)
memset(page, PAGE_POISON_PATTERN, size);
}
-EXPORT_SYMBOL_GPL(page_init_poison);
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d6baa4f451c5f..338f160220129 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -109,9 +109,8 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
- if (!inode_write_congested(mapping->host))
- __filemap_fdatawrite_range(mapping, offset, endbyte,
- WB_SYNC_NONE);
+ __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
/*
* First and last FULL page! Partial pages are deliberately
diff --git a/mm/filemap.c b/mm/filemap.c
index ad8c39d90bf94..90afe301cd527 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2229,8 +2229,9 @@ out:
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
- * find_get_pages_contig() works exactly like find_get_pages(), except
- * that the returned number of pages are guaranteed to be contiguous.
+ * find_get_pages_contig() works exactly like find_get_pages_range(),
+ * except that the returned number of pages are guaranteed to be
+ * contiguous.
*
* Return: the number of pages which were found.
*/
@@ -2290,9 +2291,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
*
- * Like find_get_pages(), except we only return head pages which are tagged
- * with @tag. @index is updated to the index immediately after the last
- * page we return, ready for the next iteration.
+ * Like find_get_pages_range(), except we only return head pages which are
+ * tagged with @tag. @index is updated to the index immediately after the
+ * last page we return, ready for the next iteration.
*
* Return: the number of pages which were found.
*/
diff --git a/mm/gup.c b/mm/gup.c
index a9d4d724aef74..0cf59858114d5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -464,10 +464,6 @@ static struct page *no_page_table(struct vm_area_struct *vma,
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
pte_t *pte, unsigned int flags)
{
- /* No page to get reference */
- if (flags & FOLL_GET)
- return -EFAULT;
-
if (flags & FOLL_TOUCH) {
pte_t entry = *pte;
@@ -597,32 +593,6 @@ retry:
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- /* Do not mlock pte-mapped THP */
- if (PageTransCompound(page))
- goto out;
-
- /*
- * The preliminary mapping check is mainly to avoid the
- * pointless overhead of lock_page on the ZERO_PAGE
- * which might bounce very badly if there is contention.
- *
- * If the page is already locked, we don't need to
- * handle it now - vmscan will handle it later if and
- * when it attempts to reclaim the page.
- */
- if (page->mapping && trylock_page(page)) {
- lru_add_drain(); /* push cached pages to LRU */
- /*
- * Because we lock page here, and migration is
- * blocked by the pte's page reference, and we
- * know the page is still mapped, we don't even
- * need to check for file-cache page truncation.
- */
- mlock_vma_page(page);
- unlock_page(page);
- }
- }
out:
pte_unmap_unlock(ptep, ptl);
return page;
@@ -945,9 +915,6 @@ static int faultin_page(struct vm_area_struct *vma,
unsigned int fault_flags = 0;
vm_fault_t ret;
- /* mlock all present pages, but do not fault in new pages */
- if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
- return -ENOENT;
if (*flags & FOLL_NOFAULT)
return -EFAULT;
if (*flags & FOLL_WRITE)
@@ -1198,15 +1165,20 @@ retry:
case -ENOMEM:
case -EHWPOISON:
goto out;
- case -ENOENT:
- goto next_page;
}
BUG();
} else if (PTR_ERR(page) == -EEXIST) {
/*
* Proper page table entry exists, but no corresponding
- * struct page.
+ * struct page. If the caller expects **pages to be
+ * filled in, bail out now, because that can't be done
+ * for this page.
*/
+ if (pages) {
+ ret = PTR_ERR(page);
+ goto out;
+ }
+
goto next_page;
} else if (IS_ERR(page)) {
ret = PTR_ERR(page);
@@ -1497,9 +1469,14 @@ long populate_vma_page_range(struct vm_area_struct *vma,
VM_BUG_ON_VMA(end > vma->vm_end, vma);
mmap_assert_locked(mm);
- gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+ /*
+ * Rightly or wrongly, the VM_LOCKONFAULT case has never used
+ * faultin_page() to break COW, so it has no work to do here.
+ */
if (vma->vm_flags & VM_LOCKONFAULT)
- gup_flags &= ~FOLL_POPULATE;
+ return nr_pages;
+
+ gup_flags = FOLL_TOUCH;
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
@@ -1566,10 +1543,9 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
* in the page table.
* FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
* a poisoned page.
- * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT.
* !FOLL_FORCE: Require proper access permissions.
*/
- gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON;
+ gup_flags = FOLL_TOUCH | FOLL_HWPOISON;
if (write)
gup_flags |= FOLL_WRITE;
@@ -1858,6 +1834,69 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */
#ifdef CONFIG_MIGRATION
+#ifdef CONFIG_DEVICE_PRIVATE
+/*
+ * Migrates a device coherent page back to normal memory. Caller should have a
+ * reference on page which will be copied to the new page if migration is
+ * successful or dropped on failure.
+ */
+static struct page *migrate_device_page(struct page *page,
+ unsigned int gup_flags)
+{
+ struct page *dpage;
+ struct migrate_vma args;
+ unsigned long src_pfn, dst_pfn = 0;
+
+ lock_page(page);
+ src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
+ args.src = &src_pfn;
+ args.dst = &dst_pfn;
+ args.cpages = 1;
+ args.npages = 1;
+ args.vma = NULL;
+ migrate_vma_setup(&args);
+ if (!(src_pfn & MIGRATE_PFN_MIGRATE))
+ return NULL;
+
+ dpage = alloc_pages(GFP_USER | __GFP_NOWARN, 0);
+
+ /*
+ * get/pin the new page now so we don't have to retry gup after
+ * migrating. We already have a reference so this should never fail.
+ */
+ if (dpage && WARN_ON_ONCE(!try_grab_page(dpage, gup_flags))) {
+ __free_pages(dpage, 0);
+ dpage = NULL;
+ }
+
+ if (dpage) {
+ lock_page(dpage);
+ dst_pfn = migrate_pfn(page_to_pfn(dpage));
+ }
+
+ migrate_vma_pages(&args);
+ if (src_pfn & MIGRATE_PFN_MIGRATE)
+ copy_highpage(dpage, page);
+ migrate_vma_finalize(&args);
+ if (dpage && !(src_pfn & MIGRATE_PFN_MIGRATE)) {
+ if (gup_flags & FOLL_PIN)
+ unpin_user_page(dpage);
+ else
+ put_page(dpage);
+ dpage = NULL;
+ }
+
+ return dpage;
+}
+#else
+static inline struct page *migrate_device_page(struct page *page,
+ unsigned int gup_flags)
+{
+ return NULL;
+}
+#endif /* CONFIG_DEVICE_PRIVATE */
+
+
/*
* Check whether all pages are pinnable, if so return number of pages. If some
* pages are not pinnable, migrate them, and unpin all pages. Return zero if
@@ -1886,6 +1925,37 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
continue;
prev_head = head;
/*
+ * Device coherent pages are managed by a driver and should not
+ * be pinned indefinitely as it prevents the driver moving the
+ * page. So when trying to pin with FOLL_LONGTERM instead try
+ * migrating page out of device memory.
+ */
+ if (is_dev_private_or_coherent_page(head)) {
+ /*
+ * device private pages will get faulted in during gup
+ * so it shouldn't be possible to see one here.
+ */
+ WARN_ON_ONCE(is_device_private_page(head));
+ WARN_ON_ONCE(PageCompound(head));
+
+ /*
+ * migration will fail if the page is pinned, so convert
+ * the pin on the source page to a normal reference.
+ */
+ if (gup_flags & FOLL_PIN) {
+ get_page(head);
+ unpin_user_page(head);
+ }
+
+ pages[i] = migrate_device_page(head, gup_flags);
+ if (!pages[i]) {
+ ret = -EBUSY;
+ break;
+ }
+ continue;
+ }
+
+ /*
* If we get a movable page, since we are going to be pinning
* these entries, try to move them out if possible.
*/
@@ -1916,15 +1986,22 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
* If list is empty, and no isolation errors, means that all pages are
* in the correct zone.
*/
- if (list_empty(&movable_page_list) && !isolation_error_count)
+ if (!ret && list_empty(&movable_page_list) && !isolation_error_count)
return nr_pages;
- if (gup_flags & FOLL_PIN) {
- unpin_user_pages(pages, nr_pages);
- } else {
- for (i = 0; i < nr_pages; i++)
+ for (i = 0; i < nr_pages; i++)
+ if (!pages[i])
+ continue;
+ else if (gup_flags & FOLL_PIN)
+ unpin_user_page(pages[i]);
+ else
put_page(pages[i]);
+
+ if (ret && !list_empty(&movable_page_list)) {
+ putback_movable_pages(&movable_page_list);
+ return ret;
}
+
if (!list_empty(&movable_page_list)) {
ret = migrate_pages(&movable_page_list, alloc_migration_target,
NULL, (unsigned long)&mtc, MIGRATE_SYNC,
@@ -2142,65 +2219,6 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
-/**
- * get_user_pages_locked() - variant of get_user_pages()
- *
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying lookup behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @locked: pointer to lock flag indicating whether lock is held and
- * subsequently whether VM_FAULT_RETRY functionality can be
- * utilised. Lock must initially be held.
- *
- * It is suitable to replace the form:
- *
- * mmap_read_lock(mm);
- * do_something()
- * get_user_pages(mm, ..., pages, NULL);
- * mmap_read_unlock(mm);
- *
- * to:
- *
- * int locked = 1;
- * mmap_read_lock(mm);
- * do_something()
- * get_user_pages_locked(mm, ..., pages, &locked);
- * if (locked)
- * mmap_read_unlock(mm);
- *
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
- */
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- int *locked)
-{
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
- return -EINVAL;
-
- return __get_user_pages_locked(current->mm, start, nr_pages,
- pages, NULL, locked,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages_locked);
-
/*
* get_user_pages_unlocked() is suitable to replace the form:
*
@@ -3143,32 +3161,3 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
-
-/*
- * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
- * Behavior is the same, except that this one sets FOLL_PIN and rejects
- * FOLL_GET.
- */
-long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- int *locked)
-{
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
-
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return -EINVAL;
-
- gup_flags |= FOLL_PIN;
- return __get_user_pages_locked(current->mm, start, nr_pages,
- pages, NULL, locked,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(pin_user_pages_locked);
diff --git a/mm/highmem.c b/mm/highmem.c
index 762679050c9a0..0cc0c4da7ed9f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -736,11 +736,11 @@ void *page_address(const struct page *page)
list_for_each_entry(pam, &pas->lh, list) {
if (pam->page == page) {
ret = pam->virtual;
- goto done;
+ break;
}
}
}
-done:
+
spin_unlock_irqrestore(&pas->lock, flags);
return ret;
}
@@ -773,13 +773,12 @@ void set_page_address(struct page *page, void *virtual)
list_for_each_entry(pam, &pas->lh, list) {
if (pam->page == page) {
list_del(&pam->list);
- spin_unlock_irqrestore(&pas->lock, flags);
- goto done;
+ break;
}
}
spin_unlock_irqrestore(&pas->lock, flags);
}
-done:
+
return;
}
diff --git a/mm/hmm.c b/mm/hmm.c
index bd56641c79d4e..af71aac3140e4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -417,7 +417,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
unsigned long addr = start;
pud_t pud;
- int ret = 0;
spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
if (!ptl)
@@ -466,7 +465,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
out_unlock:
spin_unlock(ptl);
- return ret;
+ return 0;
}
#else
#define hmm_vma_walk_pud NULL
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 406a3c28c0266..09fb65a80e636 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -34,11 +34,15 @@
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
+#include <linux/sched/sysctl.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
/*
* By default, transparent hugepage support is disabled in order to avoid
* risking an increased memory footprint for applications that are not
@@ -1303,7 +1307,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
- /* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
@@ -1319,10 +1322,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
}
/*
- * We can only reuse the page if nobody else maps the huge page or it's
- * part.
+ * See do_wp_page(): we can only map the page writable if there are
+ * no additional references. Note that we always drain the LRU
+ * pagevecs immediately after adding a THP.
*/
- if (reuse_swap_page(page)) {
+ if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page))
+ goto unlock_fallback;
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+ if (page_count(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1333,6 +1341,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
+unlock_fallback:
unlock_page(page);
spin_unlock(vmf->ptl);
fallback:
@@ -1380,39 +1389,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd, flags);
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- /*
- * We don't mlock() pte-mapped THPs. This way we can avoid
- * leaking mlocked pages into non-VM_LOCKED VMAs.
- *
- * For anon THP:
- *
- * In most cases the pmd is the only mapping of the page as we
- * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
- * writable private mappings in populate_vma_page_range().
- *
- * The only scenario when we have the page shared here is if we
- * mlocking read-only mapping shared over fork(). We skip
- * mlocking such pages.
- *
- * For file THP:
- *
- * We can expect PageDoubleMap() to be stable under page lock:
- * for file pages we set it in page_add_file_rmap(), which
- * requires page to be locked.
- */
-
- if (PageAnon(page) && compound_mapcount(page) != 1)
- goto skip_mlock;
- if (PageDoubleMap(page) || !page->mapping)
- goto skip_mlock;
- if (!trylock_page(page))
- goto skip_mlock;
- if (page->mapping && !PageDoubleMap(page))
- mlock_vma_page(page);
- unlock_page(page);
- }
-skip_mlock:
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
@@ -1610,7 +1586,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (pmd_present(orig_pmd)) {
page = pmd_page(orig_pmd);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
} else if (thp_migration_supported()) {
@@ -1766,17 +1742,28 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
#endif
- /*
- * Avoid trapping faults against the zero page. The read-only
- * data is likely to be read-cached on the local CPU and
- * local/remote hits to the zero page are not interesting.
- */
- if (prot_numa && is_huge_zero_pmd(*pmd))
- goto unlock;
+ if (prot_numa) {
+ struct page *page;
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (is_huge_zero_pmd(*pmd))
+ goto unlock;
- if (prot_numa && pmd_protnone(*pmd))
- goto unlock;
+ if (pmd_protnone(*pmd))
+ goto unlock;
+ page = pmd_page(*pmd);
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+ node_is_toptier(page_to_nid(page)))
+ goto unlock;
+ }
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
* to not clear pmd intermittently to avoid race with MADV_DONTNEED
@@ -1995,7 +1982,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
set_page_dirty(page);
if (!PageReferenced(page) && pmd_young(old_pmd))
SetPageReferenced(page);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
put_page(page);
}
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
@@ -2129,6 +2116,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
}
unlock_page_memcg(page);
+
+ /* Above is effectively page_remove_rmap(page, vma, true) */
+ munlock_vma_page(page, vma, true);
}
smp_wmb(); /* make pte visible before pmd */
@@ -2136,7 +2126,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (freeze) {
for (i = 0; i < HPAGE_PMD_NR; i++) {
- page_remove_rmap(page + i, false);
+ page_remove_rmap(page + i, vma, false);
put_page(page + i);
}
}
@@ -2147,8 +2137,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
spinlock_t *ptl;
struct mmu_notifier_range range;
- bool do_unlock_page = false;
- pmd_t _pmd;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
@@ -2167,44 +2155,14 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
-repeat:
if (pmd_trans_huge(*pmd)) {
- if (!page) {
+ if (!page)
page = pmd_page(*pmd);
- /*
- * An anonymous page must be locked, to ensure that a
- * concurrent reuse_swap_page() sees stable mapcount;
- * but reuse_swap_page() is not used on shmem or file,
- * and page lock must not be taken when zap_pmd_range()
- * calls __split_huge_pmd() while i_mmap_lock is held.
- */
- if (PageAnon(page)) {
- if (unlikely(!trylock_page(page))) {
- get_page(page);
- _pmd = *pmd;
- spin_unlock(ptl);
- lock_page(page);
- spin_lock(ptl);
- if (unlikely(!pmd_same(*pmd, _pmd))) {
- unlock_page(page);
- put_page(page);
- page = NULL;
- goto repeat;
- }
- put_page(page);
- }
- do_unlock_page = true;
- }
- }
- if (PageMlocked(page))
- clear_page_mlock(page);
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
goto out;
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
- if (do_unlock_page)
- unlock_page(page);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
* They are 3 cases to consider inside __split_huge_pmd_locked():
@@ -2332,8 +2290,11 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
} else {
/* head is still on lru (and we have it frozen) */
VM_WARN_ON(!PageLRU(head));
+ if (PageUnevictable(tail))
+ tail->mlock_count = 0;
+ else
+ list_add_tail(&tail->lru, &head->lru);
SetPageLRU(tail);
- list_add_tail(&tail->lru, &head->lru);
}
}
@@ -2518,54 +2479,6 @@ int total_mapcount(struct page *page)
return ret;
}
-/*
- * This calculates accurately how many mappings a transparent hugepage
- * has (unlike page_mapcount() which isn't fully accurate). This full
- * accuracy is primarily needed to know if copy-on-write faults can
- * reuse the page and change the mapping to read-write instead of
- * copying them. At the same time this returns the total_mapcount too.
- *
- * The function returns the highest mapcount any one of the subpages
- * has. If the return value is one, even if different processes are
- * mapping different subpages of the transparent hugepage, they can
- * all reuse it, because each process is reusing a different subpage.
- *
- * The total_mapcount is instead counting all virtual mappings of the
- * subpages. If the total_mapcount is equal to "one", it tells the
- * caller all mappings belong to the same "mm" and in turn the
- * anon_vma of the transparent hugepage can become the vma->anon_vma
- * local one as no other process may be mapping any of the subpages.
- *
- * It would be more accurate to replace page_mapcount() with
- * page_trans_huge_mapcount(), however we only use
- * page_trans_huge_mapcount() in the copy-on-write faults where we
- * need full accuracy to avoid breaking page pinning, because
- * page_trans_huge_mapcount() is slower than page_mapcount().
- */
-int page_trans_huge_mapcount(struct page *page)
-{
- int i, ret;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- if (likely(!PageTransCompound(page)))
- return atomic_read(&page->_mapcount) + 1;
-
- page = compound_head(page);
-
- ret = 0;
- for (i = 0; i < thp_nr_pages(page); i++) {
- int mapcount = atomic_read(&page[i]._mapcount) + 1;
- ret = max(ret, mapcount);
- }
-
- if (PageDoubleMap(page))
- ret -= 1;
-
- return ret + compound_mapcount(page);
-}
-
/* Racy check whether the huge page can be split */
bool can_split_huge_page(struct page *page, int *pextra_pins)
{
@@ -3171,8 +3084,9 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
put_page(page);
+ trace_set_migration_pmd(address, pmd_val(pmdswp));
}
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
@@ -3197,14 +3111,14 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
- flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
if (PageAnon(new))
page_add_anon_rmap(new, vma, mmun_start, true);
else
- page_add_file_rmap(new, true);
+ page_add_file_rmap(new, vma, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
- if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
- mlock_vma_page(new);
+
+ /* No need to invalidate - it was non-present before */
update_mmu_cache_pmd(vma, address, pvmw->pmd);
+ trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61895cc01d098..5d9838f9044a8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4637,7 +4637,6 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
- entry = pte_mkhuge(entry);
entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
return entry;
@@ -5014,7 +5013,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
set_page_dirty(page);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
spin_unlock(ptl);
tlb_remove_page_size(tlb, page, huge_page_size(h));
@@ -5259,7 +5258,7 @@ retry_avoidcopy:
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
- page_remove_rmap(old_page, true);
+ page_remove_rmap(old_page, vma, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
@@ -5818,7 +5817,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
*pagep = NULL;
goto out;
}
- folio_copy(page_folio(page), page_folio(*pagep));
+ copy_user_huge_page(page, *pagep, dst_addr, dst_vma,
+ pages_per_huge_page(h));
put_page(*pagep);
*pagep = NULL;
}
@@ -6172,7 +6172,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned int shift = huge_page_shift(hstate_vma(vma));
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
- pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
+ pte = huge_pte_modify(old_pte, newprot);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index c540c21e26f5b..791626983c2e1 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -124,9 +124,9 @@
* page of page structs (page 0) associated with the HugeTLB page contains the 4
* page structs necessary to describe the HugeTLB. The only use of the remaining
* pages of page structs (page 1 to page 7) is to point to page->compound_head.
- * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs
+ * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
* will be used for each HugeTLB page. This will allow us to free the remaining
- * 6 pages to the buddy allocator.
+ * 7 pages to the buddy allocator.
*
* Here is how things look after remapping.
*
@@ -134,30 +134,30 @@
* +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
* | | | 0 | -------------> | 0 |
* | | +-----------+ +-----------+
- * | | | 1 | -------------> | 1 |
- * | | +-----------+ +-----------+
- * | | | 2 | ----------------^ ^ ^ ^ ^ ^
- * | | +-----------+ | | | | |
- * | | | 3 | ------------------+ | | | |
- * | | +-----------+ | | | |
- * | | | 4 | --------------------+ | | |
- * | PMD | +-----------+ | | |
- * | level | | 5 | ----------------------+ | |
- * | mapping | +-----------+ | |
- * | | | 6 | ------------------------+ |
- * | | +-----------+ |
- * | | | 7 | --------------------------+
+ * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^
+ * | | +-----------+ | | | | | |
+ * | | | 2 | -----------------+ | | | | |
+ * | | +-----------+ | | | | |
+ * | | | 3 | -------------------+ | | | |
+ * | | +-----------+ | | | |
+ * | | | 4 | ---------------------+ | | |
+ * | PMD | +-----------+ | | |
+ * | level | | 5 | -----------------------+ | |
+ * | mapping | +-----------+ | |
+ * | | | 6 | -------------------------+ |
+ * | | +-----------+ |
+ * | | | 7 | ---------------------------+
* | | +-----------+
* | |
* | |
* | |
* +-----------+
*
- * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for
+ * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for
* vmemmap pages and restore the previous mapping relationship.
*
* For the HugeTLB page of the pud level mapping. It is similar to the former.
- * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages.
+ * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages.
*
* Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
* (e.g. aarch64) provides a contiguous bit in the translation table entries
@@ -166,7 +166,13 @@
*
* The contiguous bit is used to increase the mapping size at the pmd and pte
* (last) level. So this type of HugeTLB page can be optimized only when its
- * size of the struct page structs is greater than 2 pages.
+ * size of the struct page structs is greater than 1 page.
+ *
+ * Notice: The head vmemmap page is not freed to the buddy allocator and all
+ * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
+ * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
+ * associated with each HugeTLB page. The compound_head() can handle this
+ * correctly (more details refer to the comment above compound_head()).
*/
#define pr_fmt(fmt) "HugeTLB: " fmt
@@ -175,19 +181,21 @@
/*
* There are a lot of struct page structures associated with each HugeTLB page.
* For tail pages, the value of compound_head is the same. So we can reuse first
- * page of tail page structures. We map the virtual addresses of the remaining
- * pages of tail page structures to the first tail page struct, and then free
- * these page frames. Therefore, we need to reserve two pages as vmemmap areas.
+ * page of head page structures. We map the virtual addresses of all the pages
+ * of tail page structures to the head page struct, and then free these page
+ * frames. Therefore, we need to reserve one pages as vmemmap areas.
*/
-#define RESERVE_VMEMMAP_NR 2U
+#define RESERVE_VMEMMAP_NR 1U
#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
-bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
+ hugetlb_free_vmemmap_enabled_key);
+EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key);
static int __init early_hugetlb_free_vmemmap_param(char *buf)
{
/* We cannot optimize if a "struct page" crosses page boundaries. */
- if ((!is_power_of_2(sizeof(struct page)))) {
+ if (!is_power_of_2(sizeof(struct page))) {
pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
return 0;
}
@@ -196,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf)
return -EINVAL;
if (!strcmp(buf, "on"))
- hugetlb_free_vmemmap_enabled = true;
+ static_branch_enable(&hugetlb_free_vmemmap_enabled_key);
else if (!strcmp(buf, "off"))
- hugetlb_free_vmemmap_enabled = false;
+ static_branch_disable(&hugetlb_free_vmemmap_enabled_key);
else
return -EINVAL;
@@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
*/
ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
-
if (!ret)
ClearHPageVmemmapOptimized(head);
@@ -277,14 +284,13 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
BUILD_BUG_ON(__NR_USED_SUBPAGE >=
RESERVE_VMEMMAP_SIZE / sizeof(struct page));
- if (!hugetlb_free_vmemmap_enabled)
+ if (!hugetlb_free_vmemmap_enabled())
return;
vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
/*
- * The head page and the first tail page are not to be freed to buddy
- * allocator, the other pages will map to the first tail page, so they
- * can be freed.
+ * The head page is not to be freed to buddy allocator, the other tail
+ * pages will map to the head page, so they can be freed.
*
* Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
* on some architectures (e.g. aarch64). See Documentation/arm64/
diff --git a/mm/internal.h b/mm/internal.h
index d80300392a194..9a5674bd0a742 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -23,7 +23,7 @@ struct folio_batch;
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
- __GFP_ATOMIC|__GFP_NOLOCKDEP)
+ __GFP_NOLOCKDEP)
/* The GFP flags allowed during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
@@ -71,11 +71,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
-static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
-{
- return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
-}
-
struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
@@ -398,32 +393,42 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
extern long faultin_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
bool write, int *locked);
-extern void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
-static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
-{
- munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
-}
-
-/*
- * must be called with vma's mmap_lock held for read or write, and page locked.
- */
-extern void mlock_vma_page(struct page *page);
-extern unsigned int munlock_vma_page(struct page *page);
-
extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
unsigned long len);
-
/*
- * Clear the page's PageMlocked(). This can be useful in a situation where
- * we want to unconditionally remove a page from the pagecache -- e.g.,
- * on truncation or freeing.
+ * mlock_vma_page() and munlock_vma_page():
+ * should be called with vma's mmap_lock held for read or write,
+ * under page table lock for the pte/pmd being added or removed.
*
- * It is legal to call this function for any page, mlocked or not.
- * If called for a page that is still mapped by mlocked vmas, all we do
- * is revert to lazy LRU behaviour -- semantics are not broken.
+ * mlock is usually called at the end of page_add_*_rmap(),
+ * munlock at the end of page_remove_rmap(); but new anon
+ * pages are managed by lru_cache_add_inactive_or_unevictable()
+ * calling mlock_new_page().
+ *
+ * @compound is used to include pmd mappings of THPs, but filter out
+ * pte mappings of THPs, which cannot be consistently counted: a pte
+ * mapping of the THP head cannot be distinguished by the page alone.
*/
-extern void clear_page_mlock(struct page *page);
+void mlock_page(struct page *page);
+static inline void mlock_vma_page(struct page *page,
+ struct vm_area_struct *vma, bool compound)
+{
+ /* VM_IO check prevents migration from double-counting during mlock */
+ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_IO)) == VM_LOCKED) &&
+ (compound || !PageTransCompound(page)))
+ mlock_page(page);
+}
+void munlock_page(struct page *page);
+static inline void munlock_vma_page(struct page *page,
+ struct vm_area_struct *vma, bool compound)
+{
+ if (unlikely(vma->vm_flags & VM_LOCKED) &&
+ (compound || !PageTransCompound(page)))
+ munlock_page(page);
+}
+void mlock_new_page(struct page *page);
+bool need_mlock_page_drain(int cpu);
+void mlock_page_drain(int cpu);
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
@@ -498,8 +503,13 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
}
#else /* !CONFIG_MMU */
static inline void unmap_mapping_folio(struct folio *folio) { }
-static inline void clear_page_mlock(struct page *page) { }
-static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_vma_page(struct page *page,
+ struct vm_area_struct *vma, bool compound) { }
+static inline void munlock_vma_page(struct page *page,
+ struct vm_area_struct *vma, bool compound) { }
+static inline void mlock_new_page(struct page *page) { }
+static inline bool need_mlock_page_drain(int cpu) { return false; }
+static inline void mlock_page_drain(int cpu) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
@@ -572,17 +582,6 @@ static inline void mminit_verify_zonelist(void)
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */
-/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
-#if defined(CONFIG_SPARSEMEM)
-extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
- unsigned long *end_pfn);
-#else
-static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
- unsigned long *end_pfn)
-{
-}
-#endif /* CONFIG_SPARSEMEM */
-
#define NODE_RECLAIM_NOSCAN -2
#define NODE_RECLAIM_FULL -1
#define NODE_RECLAIM_SOME 0
@@ -718,4 +717,6 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);
+DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 92196562687b6..d9079ec11f313 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -387,7 +387,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
}
/*
- * The object will be poisoned by kasan_free_pages() or
+ * The object will be poisoned by kasan_poison_pages() or
* kasan_slab_free_mempool().
*/
@@ -538,7 +538,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
return NULL;
/*
- * The object has already been unpoisoned by kasan_alloc_pages() for
+ * The object has already been unpoisoned by kasan_unpoison_pages() for
* alloc_pages() or by kasan_krealloc() for krealloc().
*/
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 7355cb534e4f8..fad1887e54c05 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -32,6 +32,12 @@ enum kasan_arg_mode {
KASAN_ARG_MODE_ASYMM,
};
+enum kasan_arg_vmalloc {
+ KASAN_ARG_VMALLOC_DEFAULT,
+ KASAN_ARG_VMALLOC_OFF,
+ KASAN_ARG_VMALLOC_ON,
+};
+
enum kasan_arg_stacktrace {
KASAN_ARG_STACKTRACE_DEFAULT,
KASAN_ARG_STACKTRACE_OFF,
@@ -40,18 +46,28 @@ enum kasan_arg_stacktrace {
static enum kasan_arg kasan_arg __ro_after_init;
static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
-static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
+static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata;
+static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata;
-/* Whether KASAN is enabled at all. */
+/*
+ * Whether KASAN is enabled at all.
+ * The value remains false until KASAN is initialized by kasan_init_hw_tags().
+ */
DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
EXPORT_SYMBOL(kasan_flag_enabled);
-/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/
+/*
+ * Whether the selected mode is synchronous, asynchronous, or asymmetric.
+ * Defaults to KASAN_MODE_SYNC.
+ */
enum kasan_mode kasan_mode __ro_after_init;
EXPORT_SYMBOL_GPL(kasan_mode);
+/* Whether to enable vmalloc tagging. */
+DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+
/* Whether to collect alloc/free stack traces. */
-DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
/* kasan=off/on */
static int __init early_kasan_flag(char *arg)
@@ -89,6 +105,23 @@ static int __init early_kasan_mode(char *arg)
}
early_param("kasan.mode", early_kasan_mode);
+/* kasan.vmalloc=off/on */
+static int __init early_kasan_flag_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.vmalloc", early_kasan_flag_vmalloc);
+
/* kasan.stacktrace=off/on */
static int __init early_kasan_flag_stacktrace(char *arg)
{
@@ -116,7 +149,10 @@ static inline const char *kasan_mode_info(void)
return "sync";
}
-/* kasan_init_hw_tags_cpu() is called for each CPU. */
+/*
+ * kasan_init_hw_tags_cpu() is called for each CPU.
+ * Not marked as __init as a CPU can be hot-plugged after boot.
+ */
void kasan_init_hw_tags_cpu(void)
{
/*
@@ -124,7 +160,11 @@ void kasan_init_hw_tags_cpu(void)
* as this function is only called for MTE-capable hardware.
*/
- /* If KASAN is disabled via command line, don't initialize it. */
+ /*
+ * If KASAN is disabled via command line, don't initialize it.
+ * When this function is called, kasan_flag_enabled is not yet
+ * set by kasan_init_hw_tags(). Thus, check kasan_arg instead.
+ */
if (kasan_arg == KASAN_ARG_OFF)
return;
@@ -151,79 +191,156 @@ void __init kasan_init_hw_tags(void)
if (kasan_arg == KASAN_ARG_OFF)
return;
- /* Enable KASAN. */
- static_branch_enable(&kasan_flag_enabled);
-
switch (kasan_arg_mode) {
case KASAN_ARG_MODE_DEFAULT:
- /*
- * Default to sync mode.
- */
- fallthrough;
+ /* Default is specified by kasan_mode definition. */
+ break;
case KASAN_ARG_MODE_SYNC:
- /* Sync mode enabled. */
kasan_mode = KASAN_MODE_SYNC;
break;
case KASAN_ARG_MODE_ASYNC:
- /* Async mode enabled. */
kasan_mode = KASAN_MODE_ASYNC;
break;
case KASAN_ARG_MODE_ASYMM:
- /* Asymm mode enabled. */
kasan_mode = KASAN_MODE_ASYMM;
break;
}
+ switch (kasan_arg_vmalloc) {
+ case KASAN_ARG_VMALLOC_DEFAULT:
+ /* Default is specified by kasan_flag_vmalloc definition. */
+ break;
+ case KASAN_ARG_VMALLOC_OFF:
+ static_branch_disable(&kasan_flag_vmalloc);
+ break;
+ case KASAN_ARG_VMALLOC_ON:
+ static_branch_enable(&kasan_flag_vmalloc);
+ break;
+ }
+
switch (kasan_arg_stacktrace) {
case KASAN_ARG_STACKTRACE_DEFAULT:
- /* Default to enabling stack trace collection. */
- static_branch_enable(&kasan_flag_stacktrace);
+ /* Default is specified by kasan_flag_stacktrace definition. */
break;
case KASAN_ARG_STACKTRACE_OFF:
- /* Do nothing, kasan_flag_stacktrace keeps its default value. */
+ static_branch_disable(&kasan_flag_stacktrace);
break;
case KASAN_ARG_STACKTRACE_ON:
static_branch_enable(&kasan_flag_stacktrace);
break;
}
- pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n",
+ /* KASAN is now initialized, enable it. */
+ static_branch_enable(&kasan_flag_enabled);
+
+ pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n",
kasan_mode_info(),
+ kasan_vmalloc_enabled() ? "on" : "off",
kasan_stack_collection_enabled() ? "on" : "off");
}
-void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
+#ifdef CONFIG_KASAN_VMALLOC
+
+static void unpoison_vmalloc_pages(const void *addr, u8 tag)
{
+ struct vm_struct *area;
+ int i;
+
/*
- * This condition should match the one in post_alloc_hook() in
- * page_alloc.c.
+ * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations
+ * (see the comment in __kasan_unpoison_vmalloc), all of the pages
+ * should belong to a single area.
*/
- bool init = !want_init_on_free() && want_init_on_alloc(flags);
-
- if (flags & __GFP_SKIP_KASAN_POISON)
- SetPageSkipKASanPoison(page);
+ area = find_vm_area((void *)addr);
+ if (WARN_ON(!area))
+ return;
- if (flags & __GFP_ZEROTAGS) {
- int i;
+ for (i = 0; i < area->nr_pages; i++) {
+ struct page *page = area->pages[i];
- for (i = 0; i != 1 << order; ++i)
- tag_clear_highpage(page + i);
- } else {
- kasan_unpoison_pages(page, order, init);
+ page_kasan_tag_set(page, tag);
}
}
-void kasan_free_pages(struct page *page, unsigned int order)
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+ kasan_vmalloc_flags_t flags)
{
+ u8 tag;
+ unsigned long redzone_start, redzone_size;
+
+ if (!kasan_vmalloc_enabled())
+ return (void *)start;
+
+ if (!is_vmalloc_or_module_addr(start))
+ return (void *)start;
+
+ /*
+ * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC
+ * mappings as:
+ *
+ * 1. Unlike the software KASAN modes, hardware tag-based KASAN only
+ * supports tagging physical memory. Therefore, it can only tag a
+ * single mapping of normal physical pages.
+ * 2. Hardware tag-based KASAN can only tag memory mapped with special
+ * mapping protection bits, see arch_vmalloc_pgprot_modify().
+ * As non-VM_ALLOC mappings can be mapped outside of vmalloc code,
+ * providing these bits would require tracking all non-VM_ALLOC
+ * mappers.
+ *
+ * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags
+ * the first virtual mapping, which is created by vmalloc().
+ * Tagging the page_alloc memory backing that vmalloc() allocation is
+ * skipped, see ___GFP_SKIP_KASAN_UNPOISON.
+ *
+ * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual.
+ */
+ if (!(flags & KASAN_VMALLOC_VM_ALLOC))
+ return (void *)start;
+
/*
- * This condition should match the one in free_pages_prepare() in
- * page_alloc.c.
+ * Don't tag executable memory.
+ * The kernel doesn't tolerate having the PC register tagged.
*/
- bool init = want_init_on_free();
+ if (!(flags & KASAN_VMALLOC_PROT_NORMAL))
+ return (void *)start;
- kasan_poison_pages(page, order, init);
+ tag = kasan_random_tag();
+ start = set_tag(start, tag);
+
+ /* Unpoison and initialize memory up to size. */
+ kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT);
+
+ /*
+ * Explicitly poison and initialize the in-page vmalloc() redzone.
+ * Unlike software KASAN modes, hardware tag-based KASAN doesn't
+ * unpoison memory when populating shadow for vmalloc() space.
+ */
+ redzone_start = round_up((unsigned long)start + size,
+ KASAN_GRANULE_SIZE);
+ redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start;
+ kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID,
+ flags & KASAN_VMALLOC_INIT);
+
+ /*
+ * Set per-page tag flags to allow accessing physical memory for the
+ * vmalloc() mapping through page_address(vmalloc_to_page()).
+ */
+ unpoison_vmalloc_pages(start, tag);
+
+ return (void *)start;
}
+void __kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ /*
+ * No tagging here.
+ * The physical pages backing the vmalloc() allocation are poisoned
+ * through the usual page_alloc paths.
+ */
+}
+
+#endif
+
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
void kasan_enable_tagging_sync(void)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index c17fa8d26ffe5..4d67408e84076 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -12,7 +12,8 @@
#include <linux/static_key.h>
#include "../slab.h"
-DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
enum kasan_mode {
KASAN_MODE_SYNC,
@@ -22,6 +23,11 @@ enum kasan_mode {
extern enum kasan_mode kasan_mode __ro_after_init;
+static inline bool kasan_vmalloc_enabled(void)
+{
+ return static_branch_likely(&kasan_flag_vmalloc);
+}
+
static inline bool kasan_stack_collection_enabled(void)
{
return static_branch_unlikely(&kasan_flag_stacktrace);
@@ -71,17 +77,19 @@ static inline bool kasan_sync_fault_possible(void)
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
-#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
+#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
#else
#define KASAN_FREE_PAGE KASAN_TAG_INVALID
#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
-#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID
+#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only for SW_TAGS */
#endif
+#ifdef CONFIG_KASAN_GENERIC
+
+#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
-#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
/*
* Stack redzone shadow values
@@ -110,6 +118,8 @@ static inline bool kasan_sync_fault_possible(void)
#define KASAN_ABI_VERSION 1
#endif
+#endif /* CONFIG_KASAN_GENERIC */
+
/* Metadata layout customization. */
#define META_BYTES_PER_BLOCK 1
#define META_BLOCKS_PER_ROW 16
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 3ad9624dcc561..f14146563d412 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -117,16 +117,8 @@ static void end_report(unsigned long *flags, unsigned long addr)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
+ if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
panic("panic_on_warn set ...\n");
- }
if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)
panic("kasan.fault=panic set ...\n");
kasan_enable_current();
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 94136f84b4497..7272e248db87d 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -345,27 +345,6 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
return 0;
}
-/*
- * Poison the shadow for a vmalloc region. Called as part of the
- * freeing process at the time the region is freed.
- */
-void kasan_poison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- size = round_up(size, KASAN_GRANULE_SIZE);
- kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
-}
-
-void kasan_unpoison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- kasan_unpoison(start, size, false);
-}
-
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
void *unused)
{
@@ -496,9 +475,47 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
}
}
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+ kasan_vmalloc_flags_t flags)
+{
+ /*
+ * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC
+ * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored.
+ * Software KASAN modes can't optimize zeroing memory by combining it
+ * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
+ */
+
+ if (!is_vmalloc_or_module_addr(start))
+ return (void *)start;
+
+ /*
+ * Don't tag executable memory.
+ * The kernel doesn't tolerate having the PC register tagged.
+ */
+ if (!(flags & KASAN_VMALLOC_PROT_NORMAL))
+ return (void *)start;
+
+ start = set_tag(start, kasan_random_tag());
+ kasan_unpoison(start, size, false);
+ return (void *)start;
+}
+
+/*
+ * Poison the shadow for a vmalloc region. Called as part of the
+ * freeing process at the time the region is freed.
+ */
+void __kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ if (!is_vmalloc_or_module_addr(start))
+ return;
+
+ size = round_up(size, KASAN_GRANULE_SIZE);
+ kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
+}
+
#else /* CONFIG_KASAN_VMALLOC */
-int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask)
+int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask)
{
void *ret;
size_t scaled_size;
@@ -534,7 +551,7 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask)
return -ENOMEM;
}
-void kasan_free_shadow(const struct vm_struct *vm)
+void kasan_free_module_shadow(const struct vm_struct *vm)
{
if (vm->flags & VM_KASAN)
vfree(kasan_mem_to_shadow(vm->addr));
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 5ad40e3add457..13128fa130625 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -47,7 +47,8 @@
static bool kfence_enabled __read_mostly;
-static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index a22b1af85577d..50dbb815a2a83 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -268,13 +268,13 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
* 100x the sample interval should be more than enough to ensure we get
* a KFENCE allocation eventually.
*/
- timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+ timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
/*
* Especially for non-preemption kernels, ensure the allocation-gate
* timer can catch up: after @resched_after, every failed allocation
* attempt yields, to ensure the allocation-gate timer is scheduled.
*/
- resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL);
+ resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval);
do {
if (test_cache)
alloc = kmem_cache_alloc(test_cache, gfp);
@@ -608,7 +608,7 @@ static void test_gfpzero(struct kunit *test)
int i;
/* Skip if we think it'd take too long. */
- KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100);
+ KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100);
setup_test_cache(test, size, 0, NULL);
buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
@@ -739,7 +739,7 @@ static void test_memcache_alloc_bulk(struct kunit *test)
* 100x the sample interval should be more than enough to ensure we get
* a KFENCE allocation eventually.
*/
- timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+ timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
do {
void *objects[100];
int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 131492fd1148b..7d45d463acf55 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -46,7 +46,6 @@ enum scan_result {
SCAN_VMA_NULL,
SCAN_VMA_CHECK,
SCAN_ADDRESS_RANGE,
- SCAN_SWAP_CACHE_PAGE,
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
SCAN_CGROUP_CHARGE_FAIL,
@@ -683,16 +682,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PAGE_COUNT;
goto out;
}
- if (!pte_write(pteval) && PageSwapCache(page) &&
- !reuse_swap_page(page)) {
- /*
- * Page is in the swap cache and cannot be re-used.
- * It cannot be collapsed into a THP.
- */
- unlock_page(page);
- result = SCAN_SWAP_CACHE_PAGE;
- goto out;
- }
/*
* Isolate the page to avoid collapsing an hugepage
@@ -774,7 +763,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
*/
spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page, false);
+ page_remove_rmap(src_page, vma, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
@@ -1513,7 +1502,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (pte_none(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
- page_remove_rmap(page, false);
+ page_remove_rmap(page, vma, false);
}
pte_unmap_unlock(start_pte, ptl);
@@ -1834,13 +1823,12 @@ static void collapse_file(struct mm_struct *mm,
}
if (page_mapped(page))
- unmap_mapping_pages(mapping, index, 1, false);
+ try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
xas_lock_irq(&xas);
xas_set(&xas, index);
VM_BUG_ON_PAGE(page != xas_load(&xas), page);
- VM_BUG_ON_PAGE(page_mapped(page), page);
/*
* The page is expected to have page_count() == 3:
@@ -1904,6 +1892,13 @@ xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
+ /*
+ * If collapse is successful, flush must be done now before copying.
+ * If collapse is unsuccessful, does flush actually need to be done?
+ * Do it anyway, to clear the state.
+ */
+ try_to_unmap_flush();
+
if (result == SCAN_SUCCEED) {
struct page *page, *tmp;
diff --git a/mm/ksm.c b/mm/ksm.c
index c20bd4d9a0d9e..e246d650266ac 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1177,7 +1177,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);
- page_remove_rmap(page, false);
+ page_remove_rmap(page, vma, false);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
@@ -1252,16 +1252,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
err = replace_page(vma, page, kpage, orig_pte);
}
- if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
- munlock_vma_page(page);
- if (!PageMlocked(kpage)) {
- unlock_page(page);
- lock_page(kpage);
- mlock_vma_page(kpage);
- page = kpage; /* for final unlock */
- }
- }
-
out_unlock:
unlock_page(page);
out:
@@ -2595,6 +2585,9 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
__SetPageLocked(new_page);
+#ifdef CONFIG_SWAP
+ count_vm_event(KSM_SWPIN_COPY);
+#endif
}
return new_page;
diff --git a/mm/maccess.c b/mm/maccess.c
index d3f1a1f0b1c1a..de43f29c52f0a 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -335,3 +335,9 @@ long strnlen_user_nofault(const void __user *unsafe_addr, long count)
return ret;
}
+
+void copy_overflow(int size, unsigned long count)
+{
+ WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
+}
+EXPORT_SYMBOL_GPL(copy_overflow);
diff --git a/mm/madvise.c b/mm/madvise.c
index 5604064df4646..ae35d72627efa 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -530,6 +530,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
tlb_end_vma(tlb, vma);
}
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
+{
+ return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
+}
+
static long madvise_cold(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09d342c7cbd0d..a0e9d9f12cf5b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -254,7 +254,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
}
#ifdef CONFIG_MEMCG_KMEM
-extern spinlock_t css_set_lock;
+static DEFINE_SPINLOCK(objcg_lock);
bool mem_cgroup_kmem_disabled(void)
{
@@ -298,9 +298,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
- spin_lock_irqsave(&css_set_lock, flags);
+ spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
- spin_unlock_irqrestore(&css_set_lock, flags);
+ spin_unlock_irqrestore(&objcg_lock, flags);
percpu_ref_exit(ref);
kfree_rcu(objcg, rcu);
@@ -332,7 +332,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
- spin_lock_irq(&css_set_lock);
+ spin_lock_irq(&objcg_lock);
/* 1) Ready to reparent active objcg. */
list_add(&objcg->list, &memcg->objcg_list);
@@ -342,7 +342,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
/* 3) Move already reparented objcgs to the parent's list */
list_splice(&memcg->objcg_list, &parent->objcg_list);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irq(&objcg_lock);
percpu_ref_kill(&objcg->refcnt);
}
@@ -1257,8 +1257,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
* @nr_pages: positive when adding or negative when removing
*
* This function must be called under lru_lock, just before a page is added
- * to or just after a page is removed from an lru list (that ordering being
- * so as to allow it to check that lru_size 0 is consistent with list_empty).
+ * to or just after a page is removed from an lru list.
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zid, int nr_pages)
@@ -1371,6 +1370,7 @@ struct memory_stat {
static const struct memory_stat memory_stats[] = {
{ "anon", NR_ANON_MAPPED },
{ "file", NR_FILE_PAGES },
+ { "kernel", MEMCG_KMEM },
{ "kernel_stack", NR_KERNEL_STACK_KB },
{ "pagetables", NR_PAGETABLE },
{ "percpu", MEMCG_PERCPU_B },
@@ -2114,6 +2114,7 @@ static DEFINE_MUTEX(percpu_charge_mutex);
static void drain_obj_stock(struct obj_stock *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg);
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
#else
static inline void drain_obj_stock(struct obj_stock *stock)
@@ -2124,6 +2125,9 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
{
return false;
}
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
+{
+}
#endif
/**
@@ -2575,7 +2579,7 @@ retry:
* put the burden of reclaim on regular allocation requests
* and let these go through as privileged allocations.
*/
- if (gfp_mask & __GFP_ATOMIC)
+ if (gfp_mask & __GFP_HIGH)
goto force;
/*
@@ -2688,7 +2692,7 @@ done_restock:
READ_ONCE(memcg->swap.high);
/* Don't bother a random interrupted task */
- if (in_interrupt()) {
+ if (!in_task()) {
if (mem_high) {
schedule_work(&memcg->high_work);
break;
@@ -2979,6 +2983,18 @@ static void memcg_free_cache_id(int id)
ida_simple_remove(&memcg_cache_ida, id);
}
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
+{
+ mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ if (nr_pages > 0)
+ page_counter_charge(&memcg->kmem, nr_pages);
+ else
+ page_counter_uncharge(&memcg->kmem, -nr_pages);
+ }
+}
+
+
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@@ -2991,8 +3007,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ memcg_account_kmem(memcg, -nr_pages);
refill_stock(memcg, nr_pages);
css_put(&memcg->css);
@@ -3018,8 +3033,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_charge(&memcg->kmem, nr_pages);
+ memcg_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@@ -5067,18 +5081,8 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
- int tmp = node;
- /*
- * This routine is called against possible nodes.
- * But it's BUG to call kmalloc() against offline node.
- *
- * TODO: this routine can waste much memory for nodes which will
- * never be onlined. It's better to use memory hotplug callback
- * function.
- */
- if (!node_state(node, N_NORMAL_MEMORY))
- tmp = -1;
- pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
if (!pn)
return 1;
@@ -5090,8 +5094,6 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
}
lruvec_init(&pn->lruvec);
- pn->usage_in_excess = 0;
- pn->on_tree = false;
pn->memcg = memcg;
memcg->nodeinfo[node] = pn;
@@ -5691,8 +5693,8 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and
+ * thus not on the lru.
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
@@ -5726,7 +5728,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page))
+ if (is_dev_private_or_coherent_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
@@ -6801,8 +6803,8 @@ static void uncharge_batch(const struct uncharge_gather *ug)
page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
if (do_memsw_account())
page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
- page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+ if (ug->nr_kmem)
+ memcg_account_kmem(ug->memcg, -ug->nr_kmem);
memcg_oom_recover(ug->memcg);
}
@@ -6968,7 +6970,7 @@ void mem_cgroup_sk_alloc(struct sock *sk)
return;
/* Do not associate the sock with unrelated interrupted task's memcg. */
- if (in_interrupt())
+ if (!in_task())
return;
rcu_read_lock();
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 97a9ed8f87a96..55edb0cc38483 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -707,8 +707,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
(void *)&priv);
if (ret == 1 && priv.tk.addr)
kill_proc(&priv.tk, pfn, flags);
+ else
+ ret = 0;
mmap_read_unlock(p->mm);
- return ret ? -EFAULT : -EHWPOISON;
+ return ret > 0 ? -EHWPOISON : -EFAULT;
}
static const char *action_name[] = {
@@ -1617,12 +1619,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
goto unlock;
}
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
/*
- * TODO: Handle HMM pages which may need coordination
+ * TODO: Handle device pages which may need coordination
* with device-side memory.
*/
goto unlock;
+ default:
+ break;
}
/*
@@ -2150,12 +2156,6 @@ static int __soft_offline_page(struct page *page)
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
- /*
- * Check PageHWPoison again inside page lock because PageHWPoison
- * is set by memory_failure() outside page lock. Note that
- * memory_failure() also double-checks PageHWPoison inside page lock,
- * so there's no race between soft_offline_page() and memory_failure().
- */
lock_page(page);
if (!PageHuge(page))
wait_on_page_writeback(page);
diff --git a/mm/memory.c b/mm/memory.c
index c125c4969913a..7ace240802d85 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
set_pte_at(vma->vm_mm, address, ptep, pte);
- if (vma->vm_flags & VM_LOCKED)
- mlock_vma_page(page);
-
/*
* No need to invalidate - it was non-present before. However
* secondary CPUs may have mappings that need invalidating.
@@ -1377,7 +1374,7 @@ again:
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
- page_remove_rmap(page, false);
+ page_remove_rmap(page, vma, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(__tlb_remove_page(tlb, page))) {
@@ -1397,10 +1394,8 @@ again:
continue;
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
-
if (is_device_private_entry(entry))
- page_remove_rmap(page, false);
-
+ page_remove_rmap(page, vma, false);
put_page(page);
continue;
}
@@ -1753,16 +1748,16 @@ static int validate_page_before_insert(struct page *page)
return 0;
}
-static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
if (!pte_none(*pte))
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter_fast(mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(mm, addr, pte, mk_pte(page, prot));
+ inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+ page_add_file_rmap(page, vma, false);
+ set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
}
@@ -1776,7 +1771,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page, pgprot_t prot)
{
- struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte;
spinlock_t *ptl;
@@ -1785,17 +1779,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
if (retval)
goto out;
retval = -ENOMEM;
- pte = get_locked_pte(mm, addr, &ptl);
+ pte = get_locked_pte(vma->vm_mm, addr, &ptl);
if (!pte)
goto out;
- retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
+ retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
#ifdef pte_index
-static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
+static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
int err;
@@ -1805,7 +1799,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
err = validate_page_before_insert(page);
if (err)
return err;
- return insert_page_into_pte_locked(mm, pte, addr, page, prot);
+ return insert_page_into_pte_locked(vma, pte, addr, page, prot);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -1842,7 +1836,7 @@ more:
start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
- int err = insert_page_in_batch_locked(mm, pte,
+ int err = insert_page_in_batch_locked(vma, pte,
addr, pages[curr_page_idx], prot);
if (unlikely(err)) {
pte_unmap_unlock(start_pte, pte_lock);
@@ -3098,7 +3092,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page, false);
+ page_remove_rmap(old_page, vma, false);
}
/* Free the old page.. */
@@ -3118,16 +3112,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
mmu_notifier_invalidate_range_only_end(&range);
if (old_page) {
- /*
- * Don't let another task, with possibly unlocked vma,
- * keep the mlocked page.
- */
- if (page_copied && (vma->vm_flags & VM_LOCKED)) {
- lock_page(old_page); /* LRU manipulation */
- if (PageMlocked(old_page))
- munlock_vma_page(old_page);
- unlock_page(old_page);
- }
if (page_copied)
free_swap_cache(old_page);
put_page(old_page);
@@ -3291,19 +3275,35 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (PageAnon(vmf->page)) {
struct page *page = vmf->page;
- /* PageKsm() doesn't necessarily raise the page refcount */
- if (PageKsm(page) || page_count(page) != 1)
+ /*
+ * We have to verify under page lock: these early checks are
+ * just an optimization to avoid locking the page and freeing
+ * the swapcache if there is little hope that we can reuse.
+ *
+ * PageKsm() doesn't necessarily raise the page refcount.
+ */
+ if (PageKsm(page) || page_count(page) > 3)
+ goto copy;
+ if (!PageLRU(page))
+ /*
+ * Note: We cannot easily detect+handle references from
+ * remote LRU pagevecs or references to PageLRU() pages.
+ */
+ lru_add_drain();
+ if (page_count(page) > 1 + PageSwapCache(page))
goto copy;
if (!trylock_page(page))
goto copy;
- if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+ if (PageKsm(page) || page_count(page) != 1) {
unlock_page(page);
goto copy;
}
/*
- * Ok, we've got the only map reference, and the only
- * page count reference, and the page is locked,
- * it's dark out, and we're wearing sunglasses. Hit it.
+ * Ok, we've got the only page reference from our mapping
+ * and the page is locked, it's dark out, and we're wearing
+ * sunglasses. Hit it.
*/
unlock_page(page);
wp_page_reuse(vmf);
@@ -3481,6 +3481,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
return 0;
}
+static inline bool should_try_to_free_swap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int fault_flags)
+{
+ if (!PageSwapCache(page))
+ return false;
+ if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
+ PageMlocked(page))
+ return true;
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * user. Try freeing the swapcache to get rid of the swapcache
+ * reference only in case it's likely that we'll be the exlusive user.
+ */
+ return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+ page_count(page) == 2;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -3599,21 +3618,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_release;
}
- /*
- * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
- * release the swapcache from under us. The page pin, and pte_same
- * test below, are not enough to exclude that. Even if it is still
- * swapcache, we need to check that the page's swap has not changed.
- */
- if (unlikely((!PageSwapCache(page) ||
- page_private(page) != entry.val)) && swapcache)
- goto out_page;
-
- page = ksm_might_need_to_copy(page, vma, vmf->address);
- if (unlikely(!page)) {
- ret = VM_FAULT_OOM;
- page = swapcache;
- goto out_page;
+ if (swapcache) {
+ /*
+ * Make sure try_to_free_swap or reuse_swap_page or swapoff did
+ * not release the swapcache from under us. The page pin, and
+ * pte_same test below, are not enough to exclude that. Even if
+ * it is still swapcache, we need to check that the page's swap
+ * has not changed.
+ */
+ if (unlikely(!PageSwapCache(page) ||
+ page_private(page) != entry.val))
+ goto out_page;
+
+ /*
+ * KSM sometimes has to copy on read faults, for example, if
+ * page->index of !PageKSM() pages would be nonlinear inside the
+ * anon VMA -- PageKSM() is lost on actual swapout.
+ */
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ goto out_page;
+ }
+
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * owner. Try removing the extra reference from the local LRU
+ * pagevecs if required.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
+ !PageKsm(page) && !PageLRU(page))
+ lru_add_drain();
}
cgroup_throttle_swaprate(page, GFP_KERNEL);
@@ -3632,19 +3669,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
/*
- * The page isn't present yet, go ahead with the fault.
- *
- * Be careful about the sequence of operations here.
- * To get its accounting right, reuse_swap_page() must be called
- * while the page is counted on swap but not yet in mapcount i.e.
- * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
- * must be called after the swap_free(), or it will never succeed.
+ * Remove the swap entry and conditionally try to free up the swapcache.
+ * We're already holding a reference on the page but haven't mapped it
+ * yet.
*/
+ swap_free(entry);
+ if (should_try_to_free_swap(page, vma, vmf->flags))
+ try_to_free_swap(page);
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+
+ /*
+ * Same logic as in do_wp_page(); however, optimize for fresh pages
+ * that are certainly not shared because we just allocated them without
+ * exposing them to the swapcache.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+ (page != swapcache || page_count(page) == 1)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
@@ -3670,10 +3713,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
- swap_free(entry);
- if (mem_cgroup_swap_full(page) ||
- (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
- try_to_free_swap(page);
unlock_page(page);
if (page != swapcache && swapcache) {
/*
@@ -3947,7 +3986,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
- page_add_file_rmap(page, true);
+ page_add_file_rmap(page, vma, true);
+
/*
* deposit and withdraw with pmd lock held
*/
@@ -3996,7 +4036,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
+ page_add_file_rmap(page, vma, false);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}
@@ -5444,6 +5484,8 @@ long copy_huge_page_from_user(struct page *dst_page,
if (rc)
break;
+ flush_dcache_page(subpage);
+
cond_resched();
}
return ret_val;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2a9627dc784c3..ce68098832aa9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -295,12 +295,6 @@ struct page *pfn_to_online_page(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(pfn_to_online_page);
-/*
- * Reasonably generic function for adding memory. It is
- * expected that archs that support memory hotplug will
- * call this function after deciding the zone to which to
- * add the new pages.
- */
int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
struct mhp_params *params)
{
@@ -829,7 +823,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
struct pglist_data *pgdat = NODE_DATA(nid);
int zid;
- for (zid = 0; zid <= ZONE_NORMAL; zid++) {
+ for (zid = 0; zid < ZONE_NORMAL; zid++) {
struct zone *zone = &pgdat->node_zones[zid];
if (zone_intersects(zone, start_pfn, nr_pages))
@@ -1162,43 +1156,20 @@ static void reset_node_present_pages(pg_data_t *pgdat)
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_new_pgdat(int nid)
+static pg_data_t __ref *hotadd_init_pgdat(int nid)
{
struct pglist_data *pgdat;
+ /*
+ * NODE_DATA is preallocated (free_area_init) but its internal
+ * state is not allocated completely. Add missing pieces.
+ * Completely offline nodes stay around and they just need
+ * reintialization.
+ */
pgdat = NODE_DATA(nid);
- if (!pgdat) {
- pgdat = arch_alloc_nodedata(nid);
- if (!pgdat)
- return NULL;
-
- pgdat->per_cpu_nodestats =
- alloc_percpu(struct per_cpu_nodestat);
- arch_refresh_nodedata(nid, pgdat);
- } else {
- int cpu;
- /*
- * Reset the nr_zones, order and highest_zoneidx before reuse.
- * Note that kswapd will init kswapd_highest_zoneidx properly
- * when it starts in the near future.
- */
- pgdat->nr_zones = 0;
- pgdat->kswapd_order = 0;
- pgdat->kswapd_highest_zoneidx = 0;
- for_each_online_cpu(cpu) {
- struct per_cpu_nodestat *p;
-
- p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
- memset(p, 0, sizeof(*p));
- }
- }
-
- /* we can use NODE_DATA(nid) from here */
- pgdat->node_id = nid;
- pgdat->node_start_pfn = 0;
/* init node's zones as empty zones, we don't have any present pages.*/
- free_area_init_core_hotplug(nid);
+ free_area_init_core_hotplug(pgdat);
/*
* The node we allocated has no zone fallback lists. For avoiding
@@ -1210,6 +1181,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid)
* When memory is hot-added, all the memory is in offline state. So
* clear all zones' present_pages because they will be updated in
* online_pages() and offline_pages().
+ * TODO: should be in free_area_init_core_hotplug?
*/
reset_node_managed_pages(pgdat);
reset_node_present_pages(pgdat);
@@ -1217,16 +1189,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid)
return pgdat;
}
-static void rollback_node_hotadd(int nid)
-{
- pg_data_t *pgdat = NODE_DATA(nid);
-
- arch_refresh_nodedata(nid, NULL);
- free_percpu(pgdat->per_cpu_nodestats);
- arch_free_nodedata(pgdat);
-}
-
-
/*
* __try_online_node - online a node if offlined
* @nid: the node ID
@@ -1246,7 +1208,7 @@ static int __try_online_node(int nid, bool set_node_online)
if (node_online(nid))
return 0;
- pgdat = hotadd_new_pgdat(nid);
+ pgdat = hotadd_init_pgdat(nid);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
@@ -1327,7 +1289,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
* populate a single PMD.
*/
return memmap_on_memory &&
- !hugetlb_free_vmemmap_enabled &&
+ !hugetlb_free_vmemmap_enabled() &&
IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
size == memory_block_size_bytes() &&
IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
@@ -1445,9 +1407,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
return ret;
error:
- /* rollback pgdat allocation and others */
- if (new_node)
- rollback_node_hotadd(nid);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
error_mem_hotplug_end:
@@ -2004,6 +1963,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
return 0;
failed_removal_isolated:
+ /* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
failed_removal_pcplists_disabled:
@@ -2014,7 +1974,6 @@ failed_removal:
(unsigned long long) start_pfn << PAGE_SHIFT,
((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
reason);
- /* pushback to free area */
mem_hotplug_done();
return ret;
}
@@ -2046,12 +2005,12 @@ static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
return mem->nr_vmemmap_pages;
}
-static int check_cpu_on_node(pg_data_t *pgdat)
+static int check_cpu_on_node(int nid)
{
int cpu;
for_each_present_cpu(cpu) {
- if (cpu_to_node(cpu) == pgdat->node_id)
+ if (cpu_to_node(cpu) == nid)
/*
* the cpu on this node isn't removed, and we can't
* offline this node.
@@ -2085,7 +2044,6 @@ static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
*/
void try_offline_node(int nid)
{
- pg_data_t *pgdat = NODE_DATA(nid);
int rc;
/*
@@ -2093,7 +2051,7 @@ void try_offline_node(int nid)
* offline it. A node spans memory after move_pfn_range_to_zone(),
* e.g., after the memory block was onlined.
*/
- if (pgdat->node_spanned_pages)
+ if (node_spanned_pages(nid))
return;
/*
@@ -2105,7 +2063,7 @@ void try_offline_node(int nid)
if (rc)
return;
- if (check_cpu_on_node(pgdat))
+ if (check_cpu_on_node(nid))
return;
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 028e8dd82b442..7c852793d9e85 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,7 +123,7 @@ enum zone_type policy_zone = 0;
* run-time system-wide default policy => local allocation
*/
static struct mempolicy default_policy = {
- .refcnt = ATOMIC_INIT(1), /* never free it */
+ .refcnt = { ATOMIC_INIT(1), }, /* never free it */
.mode = MPOL_LOCAL,
};
@@ -295,7 +295,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
- atomic_set(&policy->refcnt, 1);
+ refcount_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
policy->home_node = NUMA_NO_NODE;
@@ -306,7 +306,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
- if (!atomic_dec_and_test(&p->refcnt))
+ if (!refcount_dec_and_test(&p->refcnt))
return;
kmem_cache_free(policy_cache, p);
}
@@ -907,17 +907,14 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p = NULL;
- int err;
+ int ret;
- int locked = 1;
- err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
- if (err > 0) {
- err = page_to_nid(p);
+ ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
+ if (ret > 0) {
+ ret = page_to_nid(p);
put_page(p);
}
- if (locked)
- mmap_read_unlock(mm);
- return err;
+ return ret;
}
/* Retrieve NUMA policy */
@@ -968,14 +965,14 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
/*
- * Take a refcount on the mpol, lookup_node()
- * will drop the mmap_lock, so after calling
- * lookup_node() only "pol" remains valid, "vma"
- * is stale.
+ * Take a refcount on the mpol, because we are about to
+ * drop the mmap_lock, after which only "pol" remains
+ * valid, "vma" is stale.
*/
pol_refcount = pol;
vma = NULL;
mpol_get(pol);
+ mmap_read_unlock(mm);
err = lookup_node(mm, addr);
if (err < 0)
goto out;
@@ -2409,7 +2406,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
nodemask_t mems = cpuset_mems_allowed(current);
mpol_rebind_policy(new, &mems);
}
- atomic_set(&new->refcnt, 1);
+ refcount_set(&new->refcnt, 1);
return new;
}
@@ -2706,7 +2703,7 @@ restart:
goto alloc_new;
*mpol_new = *n->policy;
- atomic_set(&mpol_new->refcnt, 1);
+ refcount_set(&mpol_new->refcnt, 1);
sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
sp_insert(sp, n_new);
@@ -2900,7 +2897,7 @@ void __init numa_policy_init(void)
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
- .refcnt = ATOMIC_INIT(1),
+ .refcnt = { ATOMIC_INIT(1), },
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.nodes = nodemask_of_node(nid),
diff --git a/mm/memremap.c b/mm/memremap.c
index 6aa5f0c2d11fd..71b8d42d820c0 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key);
static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
{
if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_COHERENT ||
pgmap->type == MEMORY_DEVICE_FS_DAX)
static_branch_dec(&devmap_managed_key);
}
@@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_COHERENT ||
pgmap->type == MEMORY_DEVICE_FS_DAX)
static_branch_inc(&devmap_managed_key);
}
@@ -115,6 +117,26 @@ static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
}
+/*
+ * This returns true if the page is reserved by ZONE_DEVICE driver.
+ */
+bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ struct dev_pagemap *pgmap;
+ struct vmem_altmap *altmap;
+ bool ret = false;
+
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ret;
+ altmap = pgmap_altmap(pgmap);
+ if (altmap && pfn < (altmap->base_pfn + altmap->reserve))
+ ret = true;
+ put_dev_pagemap(pgmap);
+
+ return ret;
+}
+
#define for_each_device_pfn(pfn, map, i) \
for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \
pfn = pfn_next(map, pfn))
@@ -282,7 +304,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
return 0;
err_add_memory:
- kasan_remove_zero_shadow(__va(range->start), range_len(range));
+ if (!is_private)
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
err_kasan:
untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
err_pfn_remap:
@@ -327,6 +350,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
return ERR_PTR(-EINVAL);
}
break;
+ case MEMORY_DEVICE_COHERENT:
+ if (!pgmap->ops->page_free) {
+ WARN(1, "Missing page_free method\n");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!pgmap->owner) {
+ WARN(1, "Missing owner\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
case MEMORY_DEVICE_FS_DAX:
if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
@@ -469,7 +502,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
void free_devmap_managed_page(struct page *page)
{
/* notify page idle for dax */
- if (!is_device_private_page(page)) {
+ if (!is_dev_private_or_coherent_page(page)) {
wake_up_var(&page->_refcount);
return;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index c7da064b4781b..23b5ac8892b4e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -51,10 +51,10 @@
#include <linux/oom.h>
#include <linux/memory.h>
#include <linux/random.h>
+#include <linux/sched/sysctl.h>
#include <asm/tlbflush.h>
-#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
#include "internal.h"
@@ -248,14 +248,14 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (PageAnon(new))
page_add_anon_rmap(new, vma, pvmw.address, false);
else
- page_add_file_rmap(new, false);
+ page_add_file_rmap(new, vma, false);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
- if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
- mlock_vma_page(new);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page_drain(smp_processor_id());
- if (PageTransHuge(page) && PageMlocked(page))
- clear_page_mlock(page);
+ trace_remove_migration_pte(pvmw.address, pte_val(pte),
+ compound_order(new));
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, pvmw.address, pvmw.pte);
@@ -345,7 +345,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
* Device private pages have an extra refcount as they are
* ZONE_DEVICE pages.
*/
- expected_count += is_device_private_page(page);
+ expected_count += is_dev_private_or_coherent_page(page);
if (mapping)
expected_count += compound_nr(page) + page_has_private(page);
@@ -917,8 +917,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
page->mapping = NULL;
if (likely(!is_zone_device_page(newpage)))
- flush_dcache_page(newpage);
-
+ flush_dcache_folio(page_folio(newpage));
}
out:
return rc;
@@ -1037,6 +1036,21 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (!page_mapped(page))
rc = move_to_new_page(newpage, page, mode);
+ /*
+ * When successful, push newpage to LRU immediately: so that if it
+ * turns out to be an mlocked page, remove_migration_ptes() will
+ * automatically build up the correct newpage->mlock_count for it.
+ *
+ * We would like to do something similar for the old page, when
+ * unsuccessful, and other cases when a page has been temporarily
+ * isolated from the unevictable LRU: but this case is the easiest.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ lru_cache_add(newpage);
+ if (page_was_mapped)
+ lru_add_drain();
+ }
+
if (page_was_mapped)
remove_migration_ptes(page,
rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
@@ -1050,20 +1064,12 @@ out_unlock:
unlock_page(page);
out:
/*
- * If migration is successful, decrease refcount of the newpage
+ * If migration is successful, decrease refcount of the newpage,
* which will not free the page because new page owner increased
- * refcounter. As well, if it is LRU page, add the page to LRU
- * list in here. Use the old state of the isolated source page to
- * determine if we migrated a LRU page. newpage was already unlocked
- * and possibly modified by its owner - don't rely on the page
- * state.
+ * refcounter.
*/
- if (rc == MIGRATEPAGE_SUCCESS) {
- if (unlikely(!is_lru))
- put_page(newpage);
- else
- putback_lru_page(newpage);
- }
+ if (rc == MIGRATEPAGE_SUCCESS)
+ put_page(newpage);
return rc;
}
@@ -1762,6 +1768,13 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
}
/*
+ * The move_pages() man page does not have an -EEXIST choice, so
+ * use -EFAULT instead.
+ */
+ if (err == -EEXIST)
+ err = -EFAULT;
+
+ /*
* If the page is already on the target node (!err), store the
* node, otherwise, store the err.
*/
@@ -2034,16 +2047,30 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
int nr_pages = thp_nr_pages(page);
+ int order = compound_order(page);
- VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+ VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
/* Do not migrate THP mapped by multiple processes */
if (PageTransHuge(page) && total_mapcount(page) > 1)
return 0;
/* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, nr_pages))
+ if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
+ int z;
+
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) ||
+ !numa_demotion_enabled)
+ return 0;
+ if (next_demotion_node(pgdat->node_id) == NUMA_NO_NODE)
+ return 0;
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ if (populated_zone(pgdat->node_zones + z))
+ break;
+ }
+ wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
return 0;
+ }
if (isolate_lru_page(page))
return 0;
@@ -2072,6 +2099,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
pg_data_t *pgdat = NODE_DATA(node);
int isolated;
int nr_remaining;
+ int nr_succeeded;
LIST_HEAD(migratepages);
new_page_t *new;
bool compound;
@@ -2110,7 +2138,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
list_add(&page->lru, &migratepages);
nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
- MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
+ MIGRATE_ASYNC, MR_NUMA_MISPLACED,
+ &nr_succeeded);
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
@@ -2119,8 +2148,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
putback_lru_page(page);
}
isolated = 0;
- } else
- count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
+ }
+ if (nr_succeeded) {
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
+ if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
+ mod_node_page_state(NODE_DATA(node), PGPROMOTE_SUCCESS,
+ nr_succeeded);
+ }
BUG_ON(!list_empty(&migratepages));
return isolated;
@@ -2264,15 +2298,21 @@ again:
if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
- if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
- goto next;
pfn = pte_pfn(pte);
- if (is_zero_pfn(pfn)) {
+ if (is_zero_pfn(pfn) &&
+ (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
+ if (page && !is_zone_device_page(page) &&
+ !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+ goto next;
+ else if (page && is_device_coherent_page(page) &&
+ (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+ page->pgmap->owner != migrate->pgmap_owner))
+ goto next;
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
@@ -2331,7 +2371,7 @@ again:
* drop page refcount. Page won't be freed, as we took
* a reference just above.
*/
- page_remove_rmap(page, false);
+ page_remove_rmap(page, vma, false);
put_page(page);
if (pte_present(pte))
@@ -2573,24 +2613,24 @@ int migrate_vma_setup(struct migrate_vma *args)
args->start &= PAGE_MASK;
args->end &= PAGE_MASK;
- if (!args->vma || is_vm_hugetlb_page(args->vma) ||
- (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
- return -EINVAL;
- if (nr_pages <= 0)
- return -EINVAL;
- if (args->start < args->vma->vm_start ||
- args->start >= args->vma->vm_end)
- return -EINVAL;
- if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
- return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
-
- memset(args->src, 0, sizeof(*args->src) * nr_pages);
- args->cpages = 0;
- args->npages = 0;
-
- migrate_vma_collect(args);
+ if (args->vma) {
+ if (is_vm_hugetlb_page(args->vma) ||
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+ return -EINVAL;
+ if (args->start < args->vma->vm_start ||
+ args->start >= args->vma->vm_end)
+ return -EINVAL;
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
+ return -EINVAL;
+
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
+ args->cpages = 0;
+ args->npages = 0;
+
+ migrate_vma_collect(args);
+ }
if (args->cpages)
migrate_vma_unmap(args);
@@ -2611,7 +2651,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
* handle_pte_fault()
* do_anonymous_page()
* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
- * private page.
+ * private or coherent page.
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
@@ -2676,25 +2716,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
*/
__SetPageUptodate(page);
- if (is_zone_device_page(page)) {
- if (is_device_private_page(page)) {
- swp_entry_t swp_entry;
+ if (is_device_private_page(page)) {
+ swp_entry_t swp_entry;
- if (vma->vm_flags & VM_WRITE)
- swp_entry = make_writable_device_private_entry(
- page_to_pfn(page));
- else
- swp_entry = make_readable_device_private_entry(
- page_to_pfn(page));
- entry = swp_entry_to_pte(swp_entry);
- } else {
- /*
- * For now we only support migrating to un-addressable
- * device memory.
- */
- pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
- goto abort;
- }
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
+ entry = swp_entry_to_pte(swp_entry);
+ } else if (is_zone_device_page(page) &&
+ !is_device_coherent_page(page)) {
+ /*
+ * We support migrating to private and coherent types
+ * for device zone memory.
+ */
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+ goto abort;
} else {
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
@@ -2776,7 +2815,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
continue;
}
- if (!page) {
+ if (!page && migrate->vma) {
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
if (!notified) {
@@ -2796,10 +2835,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
mapping = page_mapping(page);
if (is_zone_device_page(newpage)) {
- if (is_device_private_page(newpage)) {
+ if (is_dev_private_or_coherent_page(newpage)) {
/*
- * For now only support private anonymous when
- * migrating to un-addressable device memory.
+ * For now only support private and coherent
+ * anonymous when migrating to device memory.
*/
if (mapping) {
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
@@ -3082,18 +3121,21 @@ static int establish_migrate_target(int node, nodemask_t *used,
if (best_distance != -1) {
val = node_distance(node, migration_target);
if (val > best_distance)
- return NUMA_NO_NODE;
+ goto out_clear;
}
index = nd->nr;
if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
"Exceeds maximum demotion target nodes\n"))
- return NUMA_NO_NODE;
+ goto out_clear;
nd->nodes[index] = migration_target;
nd->nr++;
return migration_target;
+out_clear:
+ node_clear(migration_target, *used);
+ return NUMA_NO_NODE;
}
/*
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f584eddd3053..d50d48961b228 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -14,6 +14,7 @@
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
+#include <linux/pagewalk.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
@@ -27,6 +28,8 @@
#include "internal.h"
+static DEFINE_PER_CPU(struct pagevec, mlock_pvec);
+
bool can_do_mlock(void)
{
if (rlimit(RLIMIT_MEMLOCK) != 0)
@@ -46,441 +49,312 @@ EXPORT_SYMBOL(can_do_mlock);
* be placed on the LRU "unevictable" list, rather than the [in]active lists.
* The unevictable list is an LRU sibling list to the [in]active lists.
* PageUnevictable is set to indicate the unevictable state.
- *
- * When lazy mlocking via vmscan, it is important to ensure that the
- * vma's VM_LOCKED status is not concurrently being modified, otherwise we
- * may have mlocked a page that is being munlocked. So lazy mlock must take
- * the mmap_lock for read, and verify that the vma really is locked
- * (see mm/rmap.c).
*/
-/*
- * LRU accounting for clear_page_mlock()
- */
-void clear_page_mlock(struct page *page)
+static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec)
{
- int nr_pages;
+ /* There is nothing more we can do while it's off LRU */
+ if (!TestClearPageLRU(page))
+ return lruvec;
- if (!TestClearPageMlocked(page))
- return;
+ lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
- nr_pages = thp_nr_pages(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
- /*
- * The previous TestClearPageMlocked() corresponds to the smp_mb()
- * in __pagevec_lru_add_fn().
- *
- * See __pagevec_lru_add_fn for more explanation.
- */
- if (!isolate_lru_page(page)) {
- putback_lru_page(page);
- } else {
+ if (unlikely(page_evictable(page))) {
/*
- * We lost the race. the page already moved to evictable list.
+ * This is a little surprising, but quite possible:
+ * PageMlocked must have got cleared already by another CPU.
+ * Could this page be on the Unevictable LRU? I'm not sure,
+ * but move it now if so.
*/
- if (PageUnevictable(page))
- count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+ if (PageUnevictable(page)) {
+ del_page_from_lru_list(page, lruvec);
+ ClearPageUnevictable(page);
+ add_page_to_lru_list(page, lruvec);
+ __count_vm_events(UNEVICTABLE_PGRESCUED,
+ thp_nr_pages(page));
+ }
+ goto out;
}
+
+ if (PageUnevictable(page)) {
+ if (PageMlocked(page))
+ page->mlock_count++;
+ goto out;
+ }
+
+ del_page_from_lru_list(page, lruvec);
+ ClearPageActive(page);
+ SetPageUnevictable(page);
+ page->mlock_count = !!PageMlocked(page);
+ add_page_to_lru_list(page, lruvec);
+ __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+out:
+ SetPageLRU(page);
+ return lruvec;
}
-/*
- * Mark page as mlocked if not already.
- * If page on LRU, isolate and putback to move to unevictable list.
- */
-void mlock_vma_page(struct page *page)
+static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec)
{
- /* Serialize with page migration */
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+ lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
- if (!TestSetPageMlocked(page)) {
- int nr_pages = thp_nr_pages(page);
+ /* As above, this is a little surprising, but possible */
+ if (unlikely(page_evictable(page)))
+ goto out;
- mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
- count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
- if (!isolate_lru_page(page))
- putback_lru_page(page);
- }
+ SetPageUnevictable(page);
+ page->mlock_count = !!PageMlocked(page);
+ __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+out:
+ add_page_to_lru_list(page, lruvec);
+ SetPageLRU(page);
+ return lruvec;
}
-/*
- * Finish munlock after successful page isolation
- *
- * Page must be locked. This is a wrapper for page_mlock()
- * and putback_lru_page() with munlock accounting.
- */
-static void __munlock_isolated_page(struct page *page)
+static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec)
{
- /*
- * Optimization: if the page was mapped just once, that's our mapping
- * and we don't need to check all the other vmas.
- */
- if (page_mapcount(page) > 1)
- page_mlock(page);
+ int nr_pages = thp_nr_pages(page);
+ bool isolated = false;
- /* Did try_to_unlock() succeed or punt? */
- if (!PageMlocked(page))
- count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));
+ if (!TestClearPageLRU(page))
+ goto munlock;
- putback_lru_page(page);
+ isolated = true;
+ lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
+
+ if (PageUnevictable(page)) {
+ /* Then mlock_count is maintained, but might undercount */
+ if (page->mlock_count)
+ page->mlock_count--;
+ if (page->mlock_count)
+ goto out;
+ }
+ /* else assume that was the last mlock: reclaim will fix it if not */
+
+munlock:
+ if (TestClearPageMlocked(page)) {
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ if (isolated || !PageUnevictable(page))
+ __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
+ else
+ __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+ }
+
+ /* page_evictable() has to be checked *after* clearing Mlocked */
+ if (isolated && PageUnevictable(page) && page_evictable(page)) {
+ del_page_from_lru_list(page, lruvec);
+ ClearPageUnevictable(page);
+ add_page_to_lru_list(page, lruvec);
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+ }
+out:
+ if (isolated)
+ SetPageLRU(page);
+ return lruvec;
}
/*
- * Accounting for page isolation fail during munlock
- *
- * Performs accounting when page isolation fails in munlock. There is nothing
- * else to do because it means some other task has already removed the page
- * from the LRU. putback_lru_page() will take care of removing the page from
- * the unevictable list, if necessary. vmscan [page_referenced()] will move
- * the page back to the unevictable list if some other vma has it mlocked.
+ * Flags held in the low bits of a struct page pointer on the mlock_pvec.
*/
-static void __munlock_isolation_failed(struct page *page)
-{
- int nr_pages = thp_nr_pages(page);
-
- if (PageUnevictable(page))
- __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
- else
- __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
-}
+#define LRU_PAGE 0x1
+#define NEW_PAGE 0x2
+#define mlock_lru(page) ((struct page *)((unsigned long)page + LRU_PAGE))
+#define mlock_new(page) ((struct page *)((unsigned long)page + NEW_PAGE))
-/**
- * munlock_vma_page - munlock a vma page
- * @page: page to be unlocked, either a normal page or THP page head
- *
- * returns the size of the page as a page mask (0 for normal page,
- * HPAGE_PMD_NR - 1 for THP head page)
- *
- * called from munlock()/munmap() path with page supposedly on the LRU.
- * When we munlock a page, because the vma where we found the page is being
- * munlock()ed or munmap()ed, we want to check whether other vmas hold the
- * page locked so that we can leave it on the unevictable lru list and not
- * bother vmscan with it. However, to walk the page's rmap list in
- * page_mlock() we must isolate the page from the LRU. If some other
- * task has removed the page from the LRU, we won't be able to do that.
- * So we clear the PageMlocked as we might not get another chance. If we
- * can't isolate the page, we leave it for putback_lru_page() and vmscan
- * [page_referenced()/try_to_unmap()] to deal with.
+/*
+ * mlock_pagevec() is derived from pagevec_lru_move_fn():
+ * perhaps that can make use of such page pointer flags in future,
+ * but for now just keep it for mlock. We could use three separate
+ * pagevecs instead, but one feels better (munlocking a full pagevec
+ * does not need to drain mlocking pagevecs first).
*/
-unsigned int munlock_vma_page(struct page *page)
+static void mlock_pagevec(struct pagevec *pvec)
{
- int nr_pages;
-
- /* For page_mlock() and to serialize with page migration */
- BUG_ON(!PageLocked(page));
- VM_BUG_ON_PAGE(PageTail(page), page);
+ struct lruvec *lruvec = NULL;
+ unsigned long mlock;
+ struct page *page;
+ int i;
- if (!TestClearPageMlocked(page)) {
- /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
- return 0;
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ page = pvec->pages[i];
+ mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE);
+ page = (struct page *)((unsigned long)page - mlock);
+ pvec->pages[i] = page;
+
+ if (mlock & LRU_PAGE)
+ lruvec = __mlock_page(page, lruvec);
+ else if (mlock & NEW_PAGE)
+ lruvec = __mlock_new_page(page, lruvec);
+ else
+ lruvec = __munlock_page(page, lruvec);
}
- nr_pages = thp_nr_pages(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ release_pages(pvec->pages, pvec->nr);
+ pagevec_reinit(pvec);
+}
- if (!isolate_lru_page(page))
- __munlock_isolated_page(page);
- else
- __munlock_isolation_failed(page);
+void mlock_page_drain(int cpu)
+{
+ struct pagevec *pvec;
- return nr_pages - 1;
+ pvec = &per_cpu(mlock_pvec, cpu);
+ if (pagevec_count(pvec))
+ mlock_pagevec(pvec);
}
-/*
- * convert get_user_pages() return value to posix mlock() error
- */
-static int __mlock_posix_error_return(long retval)
+bool need_mlock_page_drain(int cpu)
{
- if (retval == -EFAULT)
- retval = -ENOMEM;
- else if (retval == -ENOMEM)
- retval = -EAGAIN;
- return retval;
+ return pagevec_count(&per_cpu(mlock_pvec, cpu));
}
-/*
- * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
- *
- * The fast path is available only for evictable pages with single mapping.
- * Then we can bypass the per-cpu pvec and get better performance.
- * when mapcount > 1 we need page_mlock() which can fail.
- * when !page_evictable(), we need the full redo logic of putback_lru_page to
- * avoid leaving evictable page in unevictable list.
- *
- * In case of success, @page is added to @pvec and @pgrescued is incremented
- * in case that the page was previously unevictable. @page is also unlocked.
+/**
+ * mlock_page - mlock a page already on (or temporarily off) LRU
+ * @page - page to be mlocked, either a normal page or a THP head.
*/
-static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
- int *pgrescued)
+void mlock_page(struct page *page)
{
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
- if (page_mapcount(page) <= 1 && page_evictable(page)) {
- pagevec_add(pvec, page);
- if (TestClearPageUnevictable(page))
- (*pgrescued)++;
- unlock_page(page);
- return true;
+ if (!TestSetPageMlocked(page)) {
+ int nr_pages = thp_nr_pages(page);
+
+ mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}
- return false;
+ get_page(page);
+ if (!pagevec_add(pvec, mlock_lru(page)) ||
+ PageHead(page) || lru_cache_disabled())
+ mlock_pagevec(pvec);
+ put_cpu_var(mlock_pvec);
}
-/*
- * Putback multiple evictable pages to the LRU
- *
- * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
- * the pages might have meanwhile become unevictable but that is OK.
+/**
+ * mlock_new_page - mlock a newly allocated page not yet on LRU
+ * @page - page to be mlocked, either a normal page or a THP head.
*/
-static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
+void mlock_new_page(struct page *page)
{
- count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
- /*
- *__pagevec_lru_add() calls release_pages() so we don't call
- * put_page() explicitly
- */
- __pagevec_lru_add(pvec);
- count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+ int nr_pages = thp_nr_pages(page);
+
+ SetPageMlocked(page);
+ mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
+
+ get_page(page);
+ if (!pagevec_add(pvec, mlock_new(page)) ||
+ PageHead(page) || lru_cache_disabled())
+ mlock_pagevec(pvec);
+ put_cpu_var(mlock_pvec);
}
-/*
- * Munlock a batch of pages from the same zone
- *
- * The work is split to two main phases. First phase clears the Mlocked flag
- * and attempts to isolate the pages, all under a single zone lru lock.
- * The second phase finishes the munlock only for pages where isolation
- * succeeded.
- *
- * Note that the pagevec may be modified during the process.
+/**
+ * munlock_page - munlock a page
+ * @page - page to be munlocked, either a normal page or a THP head.
*/
-static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+void munlock_page(struct page *page)
{
- int i;
- int nr = pagevec_count(pvec);
- int delta_munlocked = -nr;
- struct pagevec pvec_putback;
- struct lruvec *lruvec = NULL;
- int pgrescued = 0;
-
- pagevec_init(&pvec_putback);
-
- /* Phase 1: page isolation */
- for (i = 0; i < nr; i++) {
- struct page *page = pvec->pages[i];
- struct folio *folio = page_folio(page);
-
- if (TestClearPageMlocked(page)) {
- /*
- * We already have pin from follow_page_mask()
- * so we can spare the get_page() here.
- */
- if (TestClearPageLRU(page)) {
- lruvec = folio_lruvec_relock_irq(folio, lruvec);
- del_page_from_lru_list(page, lruvec);
- continue;
- } else
- __munlock_isolation_failed(page);
- } else {
- delta_munlocked++;
- }
-
- /*
- * We won't be munlocking this page in the next phase
- * but we still need to release the follow_page_mask()
- * pin. We cannot do it under lru_lock however. If it's
- * the last pin, __page_cache_release() would deadlock.
- */
- pagevec_add(&pvec_putback, pvec->pages[i]);
- pvec->pages[i] = NULL;
- }
- if (lruvec) {
- __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- unlock_page_lruvec_irq(lruvec);
- } else if (delta_munlocked) {
- mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- }
-
- /* Now we can release pins of pages that we are not munlocking */
- pagevec_release(&pvec_putback);
-
- /* Phase 2: page munlock */
- for (i = 0; i < nr; i++) {
- struct page *page = pvec->pages[i];
-
- if (page) {
- lock_page(page);
- if (!__putback_lru_fast_prepare(page, &pvec_putback,
- &pgrescued)) {
- /*
- * Slow path. We don't want to lose the last
- * pin before unlock_page()
- */
- get_page(page); /* for putback_lru_page() */
- __munlock_isolated_page(page);
- unlock_page(page);
- put_page(page); /* from follow_page_mask() */
- }
- }
- }
+ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
/*
- * Phase 3: page putback for pages that qualified for the fast path
- * This will also call put_page() to return pin from follow_page_mask()
+ * TestClearPageMlocked(page) must be left to __munlock_page(),
+ * which will check whether the page is multiply mlocked.
*/
- if (pagevec_count(&pvec_putback))
- __putback_lru_fast(&pvec_putback, pgrescued);
+
+ get_page(page);
+ if (!pagevec_add(pvec, page) ||
+ PageHead(page) || lru_cache_disabled())
+ mlock_pagevec(pvec);
+ put_cpu_var(mlock_pvec);
}
-/*
- * Fill up pagevec for __munlock_pagevec using pte walk
- *
- * The function expects that the struct page corresponding to @start address is
- * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
- *
- * The rest of @pvec is filled by subsequent pages within the same pmd and same
- * zone, as long as the pte's are present and vm_normal_page() succeeds. These
- * pages also get pinned.
- *
- * Returns the address of the next page that should be scanned. This equals
- * @start + PAGE_SIZE when no page could be added by the pte walk.
- */
-static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
- struct vm_area_struct *vma, struct zone *zone,
- unsigned long start, unsigned long end)
+static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+
{
- pte_t *pte;
+ struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
+ pte_t *start_pte, *pte;
+ struct page *page;
- /*
- * Initialize pte walk starting at the already pinned page where we
- * are sure that there is a pte, as it was pinned under the same
- * mmap_lock write op.
- */
- pte = get_locked_pte(vma->vm_mm, start, &ptl);
- /* Make sure we do not cross the page table boundary */
- end = pgd_addr_end(start, end);
- end = p4d_addr_end(start, end);
- end = pud_addr_end(start, end);
- end = pmd_addr_end(start, end);
-
- /* The page next to the pinned page is the first we will try to get */
- start += PAGE_SIZE;
- while (start < end) {
- struct page *page = NULL;
- pte++;
- if (pte_present(*pte))
- page = vm_normal_page(vma, start, *pte);
- /*
- * Break if page could not be obtained or the page's node+zone does not
- * match
- */
- if (!page || page_zone(page) != zone)
- break;
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (!pmd_present(*pmd))
+ goto out;
+ if (is_huge_zero_pmd(*pmd))
+ goto out;
+ page = pmd_page(*pmd);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page(page);
+ else
+ munlock_page(page);
+ goto out;
+ }
- /*
- * Do not use pagevec for PTE-mapped THP,
- * munlock_vma_pages_range() will handle them.
- */
+ start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
+ if (!pte_present(*pte))
+ continue;
+ page = vm_normal_page(vma, addr, *pte);
+ if (!page)
+ continue;
if (PageTransCompound(page))
- break;
-
- get_page(page);
- /*
- * Increase the address that will be returned *before* the
- * eventual break due to pvec becoming full by adding the page
- */
- start += PAGE_SIZE;
- if (pagevec_add(pvec, page) == 0)
- break;
+ continue;
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page(page);
+ else
+ munlock_page(page);
}
- pte_unmap_unlock(pte, ptl);
- return start;
+ pte_unmap(start_pte);
+out:
+ spin_unlock(ptl);
+ cond_resched();
+ return 0;
}
/*
- * munlock_vma_pages_range() - munlock all pages in the vma range.'
- * @vma - vma containing range to be munlock()ed.
+ * mlock_vma_pages_range() - mlock any pages already in the range,
+ * or munlock all pages in the range.
+ * @vma - vma containing range to be mlock()ed or munlock()ed
* @start - start address in @vma of the range
- * @end - end of range in @vma.
- *
- * For mremap(), munmap() and exit().
- *
- * Called with @vma VM_LOCKED.
+ * @end - end of range in @vma
+ * @newflags - the new set of flags for @vma.
*
- * Returns with VM_LOCKED cleared. Callers must be prepared to
- * deal with this.
- *
- * We don't save and restore VM_LOCKED here because pages are
- * still on lru. In unmap path, pages might be scanned by reclaim
- * and re-mlocked by page_mlock/try_to_unmap before we unmap and
- * free them. This will result in freeing mlocked pages.
+ * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
+ * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
*/
-void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static void mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, vm_flags_t newflags)
{
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ static const struct mm_walk_ops mlock_walk_ops = {
+ .pmd_entry = mlock_pte_range,
+ };
- while (start < end) {
- struct page *page;
- unsigned int page_mask = 0;
- unsigned long page_increm;
- struct pagevec pvec;
- struct zone *zone;
+ /*
+ * There is a slight chance that concurrent page migration,
+ * or page reclaim finding a page of this now-VM_LOCKED vma,
+ * will call mlock_vma_page() and raise page's mlock_count:
+ * double counting, leaving the page unevictable indefinitely.
+ * Communicate this danger to mlock_vma_page() with VM_IO,
+ * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
+ * mmap_lock is held in write mode here, so this weird
+ * combination should not be visible to others.
+ */
+ if (newflags & VM_LOCKED)
+ newflags |= VM_IO;
+ WRITE_ONCE(vma->vm_flags, newflags);
- pagevec_init(&pvec);
- /*
- * Although FOLL_DUMP is intended for get_dump_page(),
- * it just so happens that its special treatment of the
- * ZERO_PAGE (returning an error instead of doing get_page)
- * suits munlock very well (and if somehow an abnormal page
- * has sneaked into the range, we won't oops here: great).
- */
- page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
-
- if (page && !IS_ERR(page)) {
- if (PageTransTail(page)) {
- VM_BUG_ON_PAGE(PageMlocked(page), page);
- put_page(page); /* follow_page_mask() */
- } else if (PageTransHuge(page)) {
- lock_page(page);
- /*
- * Any THP page found by follow_page_mask() may
- * have gotten split before reaching
- * munlock_vma_page(), so we need to compute
- * the page_mask here instead.
- */
- page_mask = munlock_vma_page(page);
- unlock_page(page);
- put_page(page); /* follow_page_mask() */
- } else {
- /*
- * Non-huge pages are handled in batches via
- * pagevec. The pin from follow_page_mask()
- * prevents them from collapsing by THP.
- */
- pagevec_add(&pvec, page);
- zone = page_zone(page);
-
- /*
- * Try to fill the rest of pagevec using fast
- * pte walk. This will also update start to
- * the next page to process. Then munlock the
- * pagevec.
- */
- start = __munlock_pagevec_fill(&pvec, vma,
- zone, start, end);
- __munlock_pagevec(&pvec, zone);
- goto next;
- }
- }
- page_increm = 1 + page_mask;
- start += page_increm * PAGE_SIZE;
-next:
- cond_resched();
+ lru_add_drain();
+ walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
+ lru_add_drain();
+
+ if (newflags & VM_IO) {
+ newflags &= ~VM_IO;
+ WRITE_ONCE(vma->vm_flags, newflags);
}
}
@@ -500,8 +374,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff_t pgoff;
int nr_pages;
int ret = 0;
- int lock = !!(newflags & VM_LOCKED);
- vm_flags_t old_flags = vma->vm_flags;
+ vm_flags_t oldflags = vma->vm_flags;
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
@@ -535,9 +408,9 @@ success:
* Keep track of amount of locked VM.
*/
nr_pages = (end - start) >> PAGE_SHIFT;
- if (!lock)
+ if (!(newflags & VM_LOCKED))
nr_pages = -nr_pages;
- else if (old_flags & VM_LOCKED)
+ else if (oldflags & VM_LOCKED)
nr_pages = 0;
mm->locked_vm += nr_pages;
@@ -547,11 +420,12 @@ success:
* set VM_LOCKED, populate_vma_page_range will bring it back.
*/
- if (lock)
+ if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
+ /* No work to do, and mlocking twice would be wrong */
vma->vm_flags = newflags;
- else
- munlock_vma_pages_range(vma, start, end);
-
+ } else {
+ mlock_vma_pages_range(vma, start, end, newflags);
+ }
out:
*prev = vma;
return ret;
@@ -645,6 +519,18 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
return count >> PAGE_SHIFT;
}
+/*
+ * convert get_user_pages() return value to posix mlock() error
+ */
+static int __mlock_posix_error_return(long retval)
+{
+ if (retval == -EFAULT)
+ retval = -ENOMEM;
+ else if (retval == -ENOMEM)
+ retval = -EAGAIN;
+ return retval;
+}
+
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
unsigned long locked;
diff --git a/mm/mmap.c b/mm/mmap.c
index 1e8fdb0b51edd..64b5985b5295c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2674,6 +2674,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma->vm_prev = NULL;
do {
vma_rb_erase(vma, &mm->mm_rb);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm -= vma_pages(vma);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
@@ -2778,22 +2780,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return __split_vma(mm, vma, addr, new_below);
}
-static inline void
-unlock_range(struct vm_area_struct *start, unsigned long limit)
-{
- struct mm_struct *mm = start->vm_mm;
- struct vm_area_struct *tmp = start;
-
- while (tmp && tmp->vm_start < limit) {
- if (tmp->vm_flags & VM_LOCKED) {
- mm->locked_vm -= vma_pages(tmp);
- munlock_vma_pages_all(tmp);
- }
-
- tmp = tmp->vm_next;
- }
-}
-
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
@@ -2874,12 +2860,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
return error;
}
- /*
- * unlock any mlock()ed ranges before detaching vmas
- */
- if (mm->locked_vm)
- unlock_range(vma, end);
-
/* Detach vmas from rbtree */
if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
downgrade = false;
@@ -3147,20 +3127,12 @@ void exit_mmap(struct mm_struct *mm)
* Nothing can be holding mm->mmap_lock here and the above call
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
* __oom_reap_task_mm() will not block.
- *
- * This needs to be done before calling unlock_range(),
- * which clears VM_LOCKED, otherwise the oom reaper cannot
- * reliably test it.
*/
(void)__oom_reap_task_mm(mm);
-
set_bit(MMF_OOM_SKIP, &mm->flags);
}
mmap_write_lock(mm);
- if (mm->locked_vm)
- unlock_range(mm->mmap, ULONG_MAX);
-
arch_exit_mmap(mm);
vma = mm->mmap;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index eb89d6e018e29..0ae7571e35abb 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -81,6 +81,13 @@ void lruvec_init(struct lruvec *lruvec)
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
+ /*
+ * The "Unevictable LRU" is imaginary: though its size is maintained,
+ * it is never scanned, and unevictable pages are not threaded on it
+ * (so that their lru fields can be reused to hold mlock_count).
+ * Poison its list head, so that any operations on it would crash.
+ */
+ list_del(&lruvec->lists[LRU_UNEVICTABLE]);
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
@@ -89,13 +96,14 @@ int page_cpupid_xchg_last(struct page *page, int cpupid)
unsigned long old_flags, flags;
int last_cpupid;
+ old_flags = READ_ONCE(page->flags);
do {
- old_flags = flags = page->flags;
- last_cpupid = page_cpupid_last(page);
+ flags = old_flags;
+ last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
- } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
+ } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
return last_cpupid;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0138dfcdb1d80..2fe03e695c81c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,6 +29,7 @@
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
+#include <linux/sched/sysctl.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -83,6 +84,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (prot_numa) {
struct page *page;
+ int nid;
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
@@ -109,7 +111,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* Don't mess with PTEs if page is already on the node
* a single-threaded process is running on.
*/
- if (target_node == page_to_nid(page))
+ nid = page_to_nid(page);
+ if (target_node == nid)
+ continue;
+
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+ node_is_toptier(nid))
continue;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 832fb330376ef..6b875acabd1e7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -526,7 +526,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
set_bit(MMF_UNSTABLE, &mm->flags);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_lru_vma(vma))
+ if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
continue;
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 91d163f8d36b2..48a8cf770e3f4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -70,30 +70,33 @@ static long ratelimit_pages = 32;
/*
* Start background writeback (via writeback threads) at this percentage
*/
-int dirty_background_ratio = 10;
+static int dirty_background_ratio = 10;
/*
* dirty_background_bytes starts at 0 (disabled) so that it is a function of
* dirty_background_ratio * the amount of dirtyable memory
*/
-unsigned long dirty_background_bytes;
+static unsigned long dirty_background_bytes;
/*
* free highmem will not be subtracted from the total free memory
* for calculating free ratios if vm_highmem_is_dirtyable is true
*/
-int vm_highmem_is_dirtyable;
+static int vm_highmem_is_dirtyable;
/*
* The generator of dirty data starts writeback at this percentage
*/
-int vm_dirty_ratio = 20;
+static int vm_dirty_ratio = 20;
+
+/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
+static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
/*
* vm_dirty_bytes starts at 0 (disabled) so that it is a function of
* vm_dirty_ratio * the amount of dirtyable memory
*/
-unsigned long vm_dirty_bytes;
+static unsigned long vm_dirty_bytes;
/*
* The interval between `kupdate'-style writebacks
@@ -503,7 +506,8 @@ bool node_dirty_ok(struct pglist_data *pgdat)
return nr_pages <= limit;
}
-int dirty_background_ratio_handler(struct ctl_table *table, int write,
+#ifdef CONFIG_SYSCTL
+static int dirty_background_ratio_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -514,7 +518,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write,
return ret;
}
-int dirty_background_bytes_handler(struct ctl_table *table, int write,
+static int dirty_background_bytes_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -525,7 +529,7 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write,
return ret;
}
-int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
+static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
@@ -539,7 +543,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
return ret;
}
-int dirty_bytes_handler(struct ctl_table *table, int write,
+static int dirty_bytes_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned long old_bytes = vm_dirty_bytes;
@@ -552,6 +556,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
}
return ret;
}
+#endif
static unsigned long wp_next_time(unsigned long cur_time)
{
@@ -1993,10 +1998,11 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
return false;
}
+#ifdef CONFIG_SYSCTL
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
-int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
+static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
unsigned int old_interval = dirty_writeback_interval;
@@ -2017,6 +2023,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
return ret;
}
+#endif
void laptop_mode_timer_fn(struct timer_list *t)
{
@@ -2081,6 +2088,79 @@ static int page_writeback_cpu_online(unsigned int cpu)
return 0;
}
+#ifdef CONFIG_SYSCTL
+static struct ctl_table vm_page_writeback_sysctls[] = {
+ {
+ .procname = "dirty_background_ratio",
+ .data = &dirty_background_ratio,
+ .maxlen = sizeof(dirty_background_ratio),
+ .mode = 0644,
+ .proc_handler = dirty_background_ratio_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+ {
+ .procname = "dirty_background_bytes",
+ .data = &dirty_background_bytes,
+ .maxlen = sizeof(dirty_background_bytes),
+ .mode = 0644,
+ .proc_handler = dirty_background_bytes_handler,
+ .extra1 = SYSCTL_LONG_ONE,
+ },
+ {
+ .procname = "dirty_ratio",
+ .data = &vm_dirty_ratio,
+ .maxlen = sizeof(vm_dirty_ratio),
+ .mode = 0644,
+ .proc_handler = dirty_ratio_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+ {
+ .procname = "dirty_bytes",
+ .data = &vm_dirty_bytes,
+ .maxlen = sizeof(vm_dirty_bytes),
+ .mode = 0644,
+ .proc_handler = dirty_bytes_handler,
+ .extra1 = (void *)&dirty_bytes_min,
+ },
+ {
+ .procname = "dirty_writeback_centisecs",
+ .data = &dirty_writeback_interval,
+ .maxlen = sizeof(dirty_writeback_interval),
+ .mode = 0644,
+ .proc_handler = dirty_writeback_centisecs_handler,
+ },
+ {
+ .procname = "dirty_expire_centisecs",
+ .data = &dirty_expire_interval,
+ .maxlen = sizeof(dirty_expire_interval),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#ifdef CONFIG_HIGHMEM
+ {
+ .procname = "highmem_is_dirtyable",
+ .data = &vm_highmem_is_dirtyable,
+ .maxlen = sizeof(vm_highmem_is_dirtyable),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ {
+ .procname = "laptop_mode",
+ .data = &laptop_mode,
+ .maxlen = sizeof(laptop_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {}
+};
+#endif
+
/*
* Called early on to tune the page writeback dirty limits.
*
@@ -2105,6 +2185,9 @@ void __init page_writeback_init(void)
page_writeback_cpu_online, NULL);
cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
page_writeback_cpu_online);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("vm", vm_page_writeback_sysctls);
+#endif
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3589febc6d319..cface1d380937 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -378,25 +378,9 @@ int page_group_by_mobility_disabled __read_mostly;
*/
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
-/*
- * Calling kasan_poison_pages() only after deferred memory initialization
- * has completed. Poisoning pages during deferred memory init will greatly
- * lengthen the process and cause problem in large memory systems as the
- * deferred pages initialization is done with interrupt disabled.
- *
- * Assuming that there will be no reference to those newly initialized
- * pages before they are ever allocated, this should have no effect on
- * KASAN memory tracking as the poison will be properly inserted at page
- * allocation time. The only corner case is when pages are allocated by
- * on-demand allocation and then freed again before the deferred pages
- * initialization is done, but this is not likely to happen.
- */
-static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+static inline bool deferred_pages_enabled(void)
{
- return static_branch_unlikely(&deferred_pages) ||
- (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
- PageSkipKASanPoison(page);
+ return static_branch_unlikely(&deferred_pages);
}
/* Returns true if the struct page for the pfn is uninitialised */
@@ -447,11 +431,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
-static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+static inline bool deferred_pages_enabled(void)
{
- return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
- PageSkipKASanPoison(page);
+ return false;
}
static inline bool early_page_uninitialised(unsigned long pfn)
@@ -1117,25 +1099,24 @@ continue_merging:
}
if (order < MAX_ORDER - 1) {
/* If we are here, it means order is >= pageblock_order.
- * We want to prevent merge between freepages on isolate
- * pageblock and normal pageblock. Without this, pageblock
- * isolation could cause incorrect freepage or CMA accounting.
+ * We want to prevent merge between freepages on pageblock
+ * without fallbacks and normal pageblock. Without this,
+ * pageblock isolation could cause incorrect freepage or CMA
+ * accounting or HIGHATOMIC accounting.
*
* We don't want to hit this code for the more frequent
* low-order merging.
*/
- if (unlikely(has_isolate_pageblock(zone))) {
- int buddy_mt;
+ int buddy_mt;
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
- buddy_mt = get_pageblock_migratetype(buddy);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
+ buddy_mt = get_pageblock_migratetype(buddy);
- if (migratetype != buddy_mt
- && (is_migrate_isolate(migratetype) ||
- is_migrate_isolate(buddy_mt)))
- goto done_merging;
- }
+ if (migratetype != buddy_mt
+ && (!migratetype_is_mergeable(migratetype) ||
+ !migratetype_is_mergeable(buddy_mt)))
+ goto done_merging;
max_order = order + 1;
goto continue_merging;
}
@@ -1271,15 +1252,38 @@ out:
return ret;
}
-static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
+/*
+ * Skip KASAN memory poisoning when either:
+ *
+ * 1. Deferred memory initialization has not yet completed,
+ * see the explanation below.
+ * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON,
+ * see the comment next to it.
+ * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
+ * see the comment next to it.
+ *
+ * Poisoning pages during deferred memory init will greatly lengthen the
+ * process and cause problem in large memory systems as the deferred pages
+ * initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
- int i;
+ return deferred_pages_enabled() ||
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
+}
- if (zero_tags) {
- for (i = 0; i < numpages; i++)
- tag_clear_highpage(page + i);
- return;
- }
+static void kernel_init_free_pages(struct page *page, int numpages)
+{
+ int i;
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
@@ -1296,7 +1300,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
- bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
+ bool init = want_init_on_free();
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1363,23 +1367,21 @@ static __always_inline bool free_pages_prepare(struct page *page,
/*
* As memory initialization might be integrated into KASAN,
- * kasan_free_pages and kernel_init_free_pages must be
+ * KASAN poisoning and memory initialization code must be
* kept together to avoid discrepancies in behavior.
*
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- if (kasan_has_integrated_init()) {
- if (!skip_kasan_poison)
- kasan_free_pages(page, order);
- } else {
- bool init = want_init_on_free();
+ if (!should_skip_kasan_poison(page, fpi_flags)) {
+ kasan_poison_pages(page, order, init);
- if (init)
- kernel_init_free_pages(page, 1 << order, false);
- if (!skip_kasan_poison)
- kasan_poison_pages(page, order, init);
+ /* Memory is already initialized if KASAN did it internally. */
+ if (kasan_has_integrated_init())
+ init = false;
}
+ if (init)
+ kernel_init_free_pages(page, 1 << order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -2392,9 +2394,43 @@ static bool check_new_pages(struct page *page, unsigned int order)
return false;
}
+static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
+{
+ /* Don't skip if a software KASAN mode is enabled. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+ IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ return false;
+
+ /* Skip, if hardware tag-based KASAN is not enabled. */
+ if (!kasan_hw_tags_enabled())
+ return true;
+
+ /*
+ * With hardware tag-based KASAN enabled, skip if either:
+ *
+ * 1. Memory tags have already been cleared via tag_clear_highpage().
+ * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON.
+ */
+ return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON);
+}
+
+static inline bool should_skip_init(gfp_t flags)
+{
+ /* Don't skip, if hardware tag-based KASAN is not enabled. */
+ if (!kasan_hw_tags_enabled())
+ return false;
+
+ /* For hardware tag-based KASAN, skip if requested. */
+ return (flags & __GFP_SKIP_ZERO);
+}
+
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
+ !should_skip_init(gfp_flags);
+ bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+
set_page_private(page, 0);
set_page_refcounted(page);
@@ -2410,19 +2446,38 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
/*
* As memory initialization might be integrated into KASAN,
- * kasan_alloc_pages and kernel_init_free_pages must be
+ * KASAN unpoisoning and memory initializion code must be
* kept together to avoid discrepancies in behavior.
*/
- if (kasan_has_integrated_init()) {
- kasan_alloc_pages(page, order, gfp_flags);
- } else {
- bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+ /*
+ * If memory tags should be zeroed (which happens only when memory
+ * should be initialized as well).
+ */
+ if (init_tags) {
+ int i;
+
+ /* Initialize both memory and tags. */
+ for (i = 0; i != 1 << order; ++i)
+ tag_clear_highpage(page + i);
+
+ /* Note that memory is already initialized by the loop above. */
+ init = false;
+ }
+ if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) {
+ /* Unpoison shadow memory or set memory tags. */
kasan_unpoison_pages(page, order, init);
- if (init)
- kernel_init_free_pages(page, 1 << order,
- gfp_flags & __GFP_ZEROTAGS);
+
+ /* Note that memory is already initialized by KASAN. */
+ if (kasan_has_integrated_init())
+ init = false;
}
+ /* If memory is still not initialized, do it now. */
+ if (init)
+ kernel_init_free_pages(page, 1 << order);
+ /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
+ if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
+ SetPageSkipKASanPoison(page);
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
@@ -2479,17 +2534,13 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
+ *
+ * The other migratetypes do not have fallbacks.
*/
static int fallbacks[MIGRATE_TYPES][3] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
-#ifdef CONFIG_CMA
- [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
-#endif
};
#ifdef CONFIG_CMA
@@ -2795,8 +2846,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
/* Yoink! */
mt = get_pageblock_migratetype(page);
- if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
- && !is_migrate_cma(mt)) {
+ /* Only reserve normal pageblocks (i.e., they can merge with others) */
+ if (migratetype_is_mergeable(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
@@ -3371,8 +3422,8 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
return min(READ_ONCE(pcp->batch) << 2, high);
}
-static void free_unref_page_commit(struct page *page, unsigned long pfn,
- int migratetype, unsigned int order)
+static void free_unref_page_commit(struct page *page, int migratetype,
+ unsigned int order)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
@@ -3421,7 +3472,7 @@ void free_unref_page(struct page *page, unsigned int order)
}
local_lock_irqsave(&pagesets.lock, flags);
- free_unref_page_commit(page, pfn, migratetype, order);
+ free_unref_page_commit(page, migratetype, order);
local_unlock_irqrestore(&pagesets.lock, flags);
}
@@ -3431,13 +3482,13 @@ void free_unref_page(struct page *page, unsigned int order)
void free_unref_page_list(struct list_head *list)
{
struct page *page, *next;
- unsigned long flags, pfn;
+ unsigned long flags;
int batch_count = 0;
int migratetype;
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
- pfn = page_to_pfn(page);
+ unsigned long pfn = page_to_pfn(page);
if (!free_unref_page_prepare(page, pfn, 0)) {
list_del(&page->lru);
continue;
@@ -3453,15 +3504,10 @@ void free_unref_page_list(struct list_head *list)
free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
continue;
}
-
- set_page_private(page, pfn);
}
local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
- pfn = page_private(page);
- set_page_private(page, 0);
-
/*
* Non-isolated types over MIGRATE_PCPTYPES get added
* to the MIGRATE_MOVABLE pcp list.
@@ -3471,7 +3517,7 @@ void free_unref_page_list(struct list_head *list)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, pfn, migratetype, 0);
+ free_unref_page_commit(page, migratetype, 0);
/*
* Guard against excessive IRQ disabled times when we get
@@ -3545,8 +3591,11 @@ int __isolate_free_page(struct page *page, unsigned int order)
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
- if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
- && !is_migrate_highatomic(mt))
+ /*
+ * Only change normal pageblocks (i.e., they can merge
+ * with others)
+ */
+ if (migratetype_is_mergeable(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -3948,12 +3997,12 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
free_pages))
return true;
/*
- * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+ * Ignore watermark boosting for GFP_HIGH order-0 allocations
* when checking the min watermark. The min watermark is the
* point where boosting is ignored so that kswapd is woken up
* when below the low watermark.
*/
- if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+ if (unlikely(!order && (alloc_flags & ALLOC_HARDER) && z->watermark_boost
&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
mark = z->_watermark[WMARK_MIN];
return __zone_watermark_ok(z, order, mark, highest_zoneidx,
@@ -4681,12 +4730,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (unless __GFP_NOMEMALLOC) and ALLOC_HIGH.
*/
alloc_flags |= (__force int)
(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
- if (gfp_mask & __GFP_ATOMIC) {
+ if (gfp_mask & __GFP_HIGH) {
/*
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
@@ -4879,14 +4928,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned int cpuset_mems_cookie;
int reserve_flags;
- /*
- * We also sanity check to catch abuse of atomic reserves being used by
- * callers that are not in atomic context.
- */
- if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
- (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
- gfp_mask &= ~__GFP_ATOMIC;
-
retry_cpuset:
compaction_retries = 0;
no_progress_loops = 0;
@@ -6265,13 +6306,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
- int node, load, nr_nodes = 0;
+ int node, nr_nodes = 0;
nodemask_t used_mask = NODE_MASK_NONE;
int local_node, prev_node;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
- load = nr_online_nodes;
prev_node = local_node;
memset(node_order, 0, sizeof(node_order));
@@ -6281,13 +6321,13 @@ static void build_zonelists(pg_data_t *pgdat)
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
- if (node_distance(local_node, node) !=
- node_distance(local_node, prev_node))
- node_load[node] += load;
+ if ((node_distance(local_node, node) !=
+ node_distance(local_node, prev_node)) ||
+ node == local_node)
+ node_load[node] += nr_online_nodes;
node_order[nr_nodes++] = node;
prev_node = node;
- load--;
}
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
@@ -6380,7 +6420,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
#define BOOT_PAGESET_BATCH 1
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
-static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void __build_all_zonelists(void *data)
{
@@ -6402,7 +6442,11 @@ static void __build_all_zonelists(void *data)
if (self && !node_online(self->node_id)) {
build_zonelists(self);
} else {
- for_each_online_node(nid) {
+ /*
+ * All possible nodes have pgdat preallocated
+ * in free_area_init
+ */
+ for_each_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
@@ -7502,12 +7546,33 @@ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx,
* NOTE: this function is only called during memory hotplug
*/
#ifdef CONFIG_MEMORY_HOTPLUG
-void __ref free_area_init_core_hotplug(int nid)
+void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
{
+ int nid = pgdat->node_id;
enum zone_type z;
- pg_data_t *pgdat = NODE_DATA(nid);
+ int cpu;
pgdat_init_internals(pgdat);
+
+ if (pgdat->per_cpu_nodestats == &boot_nodestats)
+ pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
+
+ /*
+ * Reset the nr_zones, order and highest_zoneidx before reuse.
+ * Note that kswapd will init kswapd_highest_zoneidx properly
+ * when it starts in the near future.
+ */
+ pgdat->nr_zones = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_highest_zoneidx = 0;
+ pgdat->node_start_pfn = 0;
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
+
for (z = 0; z < MAX_NR_ZONES; z++)
zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
}
@@ -7657,9 +7722,14 @@ static void __init free_area_init_node(int nid)
pgdat->node_start_pfn = start_pfn;
pgdat->per_cpu_nodestats = NULL;
- pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
- (u64)start_pfn << PAGE_SHIFT,
- end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+ if (start_pfn != end_pfn) {
+ pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+ } else {
+ pr_info("Initmem setup node %d as memoryless\n", nid);
+ }
+
calculate_node_totalpages(pgdat, start_pfn, end_pfn);
alloc_node_mem_map(pgdat);
@@ -8096,8 +8166,36 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
+ for_each_node(nid) {
+ pg_data_t *pgdat;
+
+ if (!node_online(nid)) {
+ pr_info("Initializing node %d as memoryless\n", nid);
+
+ /* Allocator not initialized yet */
+ pgdat = arch_alloc_nodedata(nid);
+ if (!pgdat) {
+ pr_err("Cannot allocate %zuB for node %d.\n",
+ sizeof(*pgdat), nid);
+ continue;
+ }
+ arch_refresh_nodedata(nid, pgdat);
+ free_area_init_memoryless_node(nid);
+
+ /*
+ * We do not want to confuse userspace by sysfs
+ * files/directories for node without any memory
+ * attached to it, so this node is not marked as
+ * N_MEMORY and not marked online so that no sysfs
+ * hierarchy will be created via register_one_node for
+ * it. The pgdat will get fully initialized by
+ * hotadd_init_pgdat() when memory is hotplugged into
+ * this node.
+ */
+ continue;
+ }
+
+ pgdat = NODE_DATA(nid);
free_area_init_node(nid);
/* Any memory on that node */
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 99e360df94652..d56afa9c792ed 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -10,6 +10,7 @@
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
+#include <linux/memcontrol.h>
#include <linux/sched/clock.h>
#include "internal.h"
@@ -28,6 +29,7 @@ struct page_owner {
depot_stack_handle_t free_handle;
u64 ts_nsec;
u64 free_ts_nsec;
+ char comm[TASK_COMM_LEN];
pid_t pid;
};
@@ -164,6 +166,8 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
page_owner->last_migrate_reason = -1;
page_owner->pid = current->pid;
page_owner->ts_nsec = local_clock();
+ strlcpy(page_owner->comm, current->comm,
+ sizeof(page_owner->comm));
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
@@ -231,6 +235,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
new_page_owner->pid = old_page_owner->pid;
new_page_owner->ts_nsec = old_page_owner->ts_nsec;
new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
+ strcpy(new_page_owner->comm, old_page_owner->comm);
/*
* We don't clear the bit on the old folio as it's going to be freed
@@ -325,6 +330,45 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
seq_putc(m, '\n');
}
+/*
+ * Looking for memcg information and print it out
+ */
+static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
+ struct page *page)
+{
+#ifdef CONFIG_MEMCG
+ unsigned long memcg_data;
+ struct mem_cgroup *memcg;
+ bool online;
+ char name[80];
+
+ rcu_read_lock();
+ memcg_data = READ_ONCE(page->memcg_data);
+ if (!memcg_data)
+ goto out_unlock;
+
+ if (memcg_data & MEMCG_DATA_OBJCGS)
+ ret += scnprintf(kbuf + ret, count - ret,
+ "Slab cache page\n");
+
+ memcg = page_memcg_check(page);
+ if (!memcg)
+ goto out_unlock;
+
+ online = (memcg->css.flags & CSS_ONLINE);
+ cgroup_name(memcg->css.cgroup, name, sizeof(name));
+ ret += scnprintf(kbuf + ret, count - ret,
+ "Charged %sto %smemcg %s\n",
+ PageMemcgKmem(page) ? "(via objcg) " : "",
+ online ? "" : "offline ",
+ name);
+out_unlock:
+ rcu_read_unlock();
+#endif /* CONFIG_MEMCG */
+
+ return ret;
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_owner *page_owner,
@@ -338,19 +382,17 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (!kbuf)
return -ENOMEM;
- ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n",
+ ret = scnprintf(kbuf, count,
+ "Page allocated via order %u, mask %#x(%pGg), pid %d (%s), ts %llu ns, free_ts %llu ns\n",
page_owner->order, page_owner->gfp_mask,
&page_owner->gfp_mask, page_owner->pid,
- page_owner->ts_nsec, page_owner->free_ts_nsec);
-
- if (ret >= count)
- goto err;
+ page_owner->comm, page_owner->ts_nsec,
+ page_owner->free_ts_nsec);
/* Print information relevant to grouping pages by mobility */
pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfp_migratetype(page_owner->gfp_mask);
- ret += snprintf(kbuf + ret, count - ret,
+ ret += scnprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %pGp\n",
pfn,
migratetype_names[page_mt],
@@ -358,21 +400,18 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
migratetype_names[pageblock_mt],
&page->flags);
- if (ret >= count)
- goto err;
-
ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
if (ret >= count)
goto err;
if (page_owner->last_migrate_reason != -1) {
- ret += snprintf(kbuf + ret, count - ret,
+ ret += scnprintf(kbuf + ret, count - ret,
"Page has been migrated, last migrate reason: %s\n",
migrate_reason_names[page_owner->last_migrate_reason]);
- if (ret >= count)
- goto err;
}
+ ret = print_page_owner_memcg(kbuf, count, ret, page);
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
@@ -415,9 +454,10 @@ void __dump_page_owner(const struct page *page)
else
pr_alert("page_owner tracks the page as freed\n");
- pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n",
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d (%s), ts %llu, free_ts %llu\n",
page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
- page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
+ page_owner->pid, page_owner->comm, page_owner->ts_nsec,
+ page_owner->free_ts_nsec);
handle = READ_ONCE(page_owner->handle);
if (!handle)
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 2054c9213c433..f58d73c927892 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -84,15 +84,19 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
gfp_t gfp)
{
unsigned int cpu, tcpu;
- int i;
+ int i, nid;
gfp |= __GFP_HIGHMEM;
for_each_possible_cpu(cpu) {
+ nid = cpu_to_node(cpu);
+ if (nid == NUMA_NO_NODE || !node_online(nid))
+ nid = numa_mem_id();
+
for (i = page_start; i < page_end; i++) {
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
- *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+ *pagep = alloc_pages_node(nid, gfp, 0);
if (!*pagep)
goto err;
}
diff --git a/mm/ptdump.c b/mm/ptdump.c
index da751448d0e4e..eea3d28d173c2 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -40,8 +40,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 0, pgd_val(val));
- if (pgd_leaf(val))
+ if (pgd_leaf(val)) {
st->note_page(st, addr, 0, pgd_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -61,8 +63,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 1, p4d_val(val));
- if (p4d_leaf(val))
+ if (p4d_leaf(val)) {
st->note_page(st, addr, 1, p4d_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -82,8 +86,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 2, pud_val(val));
- if (pud_leaf(val))
+ if (pud_leaf(val)) {
st->note_page(st, addr, 2, pud_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -101,8 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 3, pmd_val(val));
- if (pmd_leaf(val))
+ if (pmd_leaf(val)) {
st->note_page(st, addr, 3, pmd_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
diff --git a/mm/readahead.c b/mm/readahead.c
index cf0dcf89eb69b..feda2b1702f1b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -595,12 +595,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
folio_clear_readahead(folio);
- /*
- * Defer asynchronous read-ahead on IO congestion.
- */
- if (inode_read_congested(ractl->mapping->host))
- return;
-
if (blk_cgroup_congested())
return;
diff --git a/mm/rmap.c b/mm/rmap.c
index 6a1e8c7f62136..b0356dc1ed424 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -76,7 +76,9 @@
#include <asm/tlbflush.h>
+#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
+#include <trace/events/migrate.h>
#include "internal.h"
@@ -89,7 +91,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
- atomic_set(&anon_vma->refcount, 1);
+ refcount_set(&anon_vma->refcount, 1);
anon_vma->degree = 1; /* Reference for first vma */
anon_vma->parent = anon_vma;
/*
@@ -104,7 +106,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
- VM_BUG_ON(atomic_read(&anon_vma->refcount));
+ VM_BUG_ON(refcount_read(&anon_vma->refcount));
/*
* Synchronize against page_lock_anon_vma_read() such that
@@ -446,7 +448,7 @@ static void anon_vma_ctor(void *data)
struct anon_vma *anon_vma = data;
init_rwsem(&anon_vma->rwsem);
- atomic_set(&anon_vma->refcount, 0);
+ refcount_set(&anon_vma->refcount, 0);
anon_vma->rb_root = RB_ROOT_CACHED;
}
@@ -496,7 +498,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ if (!refcount_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
@@ -555,7 +557,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
}
/* trylock failed, we got to sleep */
- if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ if (!refcount_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
@@ -570,7 +572,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
rcu_read_unlock();
anon_vma_lock_read(anon_vma);
- if (atomic_dec_and_test(&anon_vma->refcount)) {
+ if (refcount_dec_and_test(&anon_vma->refcount)) {
/*
* Oops, we held the last refcount, release the lock
* and bail -- can't simply use put_anon_vma() because
@@ -812,7 +814,10 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
- if (vma->vm_flags & VM_LOCKED) {
+ if ((vma->vm_flags & VM_LOCKED) &&
+ (!PageTransCompound(page) || !pvmw.pte)) {
+ /* Restore the mlock which got missed */
+ mlock_vma_page(page, vma, !pvmw.pte);
page_vma_mapped_walk_done(&pvmw);
pra->vm_flags |= VM_LOCKED;
return false; /* To break the loop */
@@ -851,7 +856,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (referenced) {
pra->referenced++;
- pra->vm_flags |= vma->vm_flags;
+ pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
}
if (!pra->mapcount)
@@ -1181,17 +1186,17 @@ void do_page_add_anon_rmap(struct page *page,
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
- if (unlikely(PageKsm(page))) {
+ if (unlikely(PageKsm(page)))
unlock_page_memcg(page);
- return;
- }
/* address might be in next vma when migration races vma_adjust */
- if (first)
+ else if (first)
__page_set_anon_rmap(page, vma, address,
flags & RMAP_EXCLUSIVE);
else
__page_check_anon_rmap(page, vma, address);
+
+ mlock_vma_page(page, vma, compound);
}
/**
@@ -1232,12 +1237,14 @@ void page_add_new_anon_rmap(struct page *page,
/**
* page_add_file_rmap - add pte mapping to a file page
- * @page: the page to add the mapping to
- * @compound: charge the page as compound or small page
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_add_file_rmap(struct page *page, bool compound)
+void page_add_file_rmap(struct page *page,
+ struct vm_area_struct *vma, bool compound)
{
int i, nr = 1;
@@ -1260,13 +1267,8 @@ void page_add_file_rmap(struct page *page, bool compound)
nr_pages);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
- struct page *head = compound_head(page);
-
VM_WARN_ON_ONCE(!PageLocked(page));
-
- SetPageDoubleMap(head);
- if (PageMlocked(page))
- clear_page_mlock(head);
+ SetPageDoubleMap(compound_head(page));
}
if (!atomic_inc_and_test(&page->_mapcount))
goto out;
@@ -1274,6 +1276,8 @@ void page_add_file_rmap(struct page *page, bool compound)
__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
unlock_page_memcg(page);
+
+ mlock_vma_page(page, vma, compound);
}
static void page_remove_file_rmap(struct page *page, bool compound)
@@ -1316,9 +1320,6 @@ static void page_remove_file_rmap(struct page *page, bool compound)
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
-
- if (unlikely(PageMlocked(page)))
- clear_page_mlock(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
@@ -1358,9 +1359,6 @@ static void page_remove_anon_compound_rmap(struct page *page)
nr = thp_nr_pages(page);
}
- if (unlikely(PageMlocked(page)))
- clear_page_mlock(page);
-
if (nr)
__mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
}
@@ -1368,11 +1366,13 @@ static void page_remove_anon_compound_rmap(struct page *page)
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
+ * @vma: the vm area from which the mapping is removed
* @compound: uncharge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page, bool compound)
+void page_remove_rmap(struct page *page,
+ struct vm_area_struct *vma, bool compound)
{
lock_page_memcg(page);
@@ -1397,9 +1397,6 @@ void page_remove_rmap(struct page *page, bool compound)
*/
__dec_lruvec_page_state(page, NR_ANON_MAPPED);
- if (unlikely(PageMlocked(page)))
- clear_page_mlock(page);
-
if (PageTransCompound(page))
deferred_split_huge_page(compound_head(page));
@@ -1414,6 +1411,8 @@ void page_remove_rmap(struct page *page, bool compound)
*/
out:
unlock_page_memcg(page);
+
+ munlock_vma_page(page, vma, compound);
}
/*
@@ -1469,28 +1468,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
/*
- * If the page is mlock()d, we cannot swap it out.
+ * If the page is in an mlock()d vma, we must not swap it out.
*/
if (!(flags & TTU_IGNORE_MLOCK) &&
(vma->vm_flags & VM_LOCKED)) {
- /*
- * PTE-mapped THP are never marked as mlocked: so do
- * not set it on a DoubleMap THP, nor on an Anon THP
- * (which may still be PTE-mapped after DoubleMap was
- * cleared). But stop unmapping even in those cases.
- */
- if (!PageTransCompound(page) || (PageHead(page) &&
- !PageDoubleMap(page) && !PageAnon(page)))
- mlock_vma_page(page);
+ /* Restore the mlock which got missed */
+ mlock_vma_page(page, vma, false);
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_PAGE(!pvmw.pte, page);
-
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
@@ -1553,7 +1545,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+ if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
hugetlb_count_sub(compound_nr(page), mm);
@@ -1599,7 +1591,30 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* MADV_FREE page check */
if (!PageSwapBacked(page)) {
- if (!PageDirty(page)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = page_count(page);
+ map_count = page_mapcount(page);
+
+ /*
+ * Order reads for page refcount and dirty flag;
+ * see __remove_mapping().
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be from the isolation
+ * plus one or more rmap's (dropped by discard:).
+ */
+ if ((ref_count == 1 + map_count) &&
+ !PageDirty(page)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
@@ -1668,7 +1683,9 @@ discard:
*
* See Documentation/vm/mmu_notifier.rst
*/
- page_remove_rmap(subpage, PageHuge(page));
+ page_remove_rmap(subpage, vma, PageHuge(page));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page_drain(smp_processor_id());
put_page(page);
}
@@ -1835,7 +1852,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (is_zone_device_page(page)) {
+ if (is_device_private_page(page)) {
unsigned long pfn = page_to_pfn(page);
swp_entry_t entry;
pte_t swp_pte;
@@ -1861,6 +1878,8 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
if (pte_swp_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
+ compound_order(page));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
@@ -1873,7 +1892,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
* memory are supported.
*/
subpage = page;
- } else if (PageHWPoison(page)) {
+ } else if (PageHWPoison(subpage)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
hugetlb_count_sub(compound_nr(page), mm);
@@ -1929,6 +1948,8 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
+ trace_set_migration_pte(address, pte_val(swp_pte),
+ compound_order(page));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
@@ -1942,7 +1963,9 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
*
* See Documentation/vm/mmu_notifier.rst
*/
- page_remove_rmap(subpage, PageHuge(page));
+ page_remove_rmap(subpage, vma, PageHuge(page));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page_drain(smp_processor_id());
put_page(page);
}
@@ -1976,7 +1999,8 @@ void try_to_migrate(struct page *page, enum ttu_flags flags)
TTU_SYNC)))
return;
- if (is_zone_device_page(page) && !is_device_private_page(page))
+ if (is_zone_device_page(page) &&
+ !is_dev_private_or_coherent_page(page))
return;
/*
@@ -1996,76 +2020,6 @@ void try_to_migrate(struct page *page, enum ttu_flags flags)
rmap_walk(page, &rwc);
}
-/*
- * Walks the vma's mapping a page and mlocks the page if any locked vma's are
- * found. Once one is found the page is locked and the scan can be terminated.
- */
-static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, void *unused)
-{
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = address,
- };
-
- /* An un-locked vma doesn't have any pages to lock, continue the scan */
- if (!(vma->vm_flags & VM_LOCKED))
- return true;
-
- while (page_vma_mapped_walk(&pvmw)) {
- /*
- * Need to recheck under the ptl to serialise with
- * __munlock_pagevec_fill() after VM_LOCKED is cleared in
- * munlock_vma_pages_range().
- */
- if (vma->vm_flags & VM_LOCKED) {
- /*
- * PTE-mapped THP are never marked as mlocked; but
- * this function is never called on a DoubleMap THP,
- * nor on an Anon THP (which may still be PTE-mapped
- * after DoubleMap was cleared).
- */
- mlock_vma_page(page);
- /*
- * No need to scan further once the page is marked
- * as mlocked.
- */
- page_vma_mapped_walk_done(&pvmw);
- return false;
- }
- }
-
- return true;
-}
-
-/**
- * page_mlock - try to mlock a page
- * @page: the page to be mlocked
- *
- * Called from munlock code. Checks all of the VMAs mapping the page and mlocks
- * the page if any are found. The page will be returned with PG_mlocked cleared
- * if it is not mapped by any locked vmas.
- */
-void page_mlock(struct page *page)
-{
- struct rmap_walk_control rwc = {
- .rmap_one = page_mlock_one,
- .done = page_not_mapped,
- .anon_lock = page_lock_anon_vma_read,
-
- };
-
- VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
- VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
-
- /* Anon THP are only marked as mlocked when singly mapped */
- if (PageTransCompound(page) && PageAnon(page))
- return;
-
- rmap_walk(page, &rwc);
-}
-
#ifdef CONFIG_DEVICE_PRIVATE
struct make_exclusive_args {
struct mm_struct *mm;
@@ -2148,7 +2102,7 @@ static bool page_make_device_exclusive_one(struct page *page,
* There is a reference on the page for the swap entry which has
* been removed, so shouldn't take another.
*/
- page_remove_rmap(subpage, false);
+ page_remove_rmap(subpage, vma, false);
}
mmu_notifier_invalidate_range_end(&range);
@@ -2257,7 +2211,7 @@ void __put_anon_vma(struct anon_vma *anon_vma)
struct anon_vma *root = anon_vma->root;
anon_vma_free(anon_vma);
- if (root != anon_vma && atomic_dec_and_test(&root->refcount))
+ if (root != anon_vma && refcount_dec_and_test(&root->refcount))
anon_vma_free(root);
}
@@ -2291,11 +2245,6 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
- *
- * When called from page_mlock(), the mmap_lock of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * LOCKED.
*/
static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
bool locked)
@@ -2344,11 +2293,6 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
- *
- * When called from page_mlock(), the mmap_lock of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * LOCKED.
*/
static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
bool locked)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index db6df27c852a7..8aecd6b3896c7 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -34,6 +34,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
/**
* struct vmemmap_remap_walk - walk vmemmap page table
*
@@ -53,8 +54,7 @@ struct vmemmap_remap_walk {
struct list_head *vmemmap_pages;
};
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
- struct vmemmap_remap_walk *walk)
+static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
pmd_t __pmd;
int i;
@@ -76,15 +76,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
set_pte_at(&init_mm, addr, pte, entry);
}
- /* Make pte visible before pmd. See comment in pmd_install(). */
- smp_wmb();
- pmd_populate_kernel(&init_mm, pmd, pgtable);
-
- flush_tlb_kernel_range(start, start + PMD_SIZE);
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pmd_leaf(*pmd))) {
+ /* Make pte visible before pmd. See comment in pmd_install(). */
+ smp_wmb();
+ pmd_populate_kernel(&init_mm, pmd, pgtable);
+ flush_tlb_kernel_range(start, start + PMD_SIZE);
+ } else {
+ pte_free_kernel(&init_mm, pgtable);
+ }
+ spin_unlock(&init_mm.page_table_lock);
return 0;
}
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+ int leaf;
+
+ spin_lock(&init_mm.page_table_lock);
+ leaf = pmd_leaf(*pmd);
+ spin_unlock(&init_mm.page_table_lock);
+
+ if (!leaf)
+ return 0;
+
+ return __split_vmemmap_huge_pmd(pmd, start);
+}
+
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end,
struct vmemmap_remap_walk *walk)
@@ -121,13 +140,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
pmd = pmd_offset(pud, addr);
do {
- if (pmd_leaf(*pmd)) {
- int ret;
+ int ret;
+
+ ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+ if (ret)
+ return ret;
- ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
- if (ret)
- return ret;
- }
next = pmd_addr_end(addr, end);
vmemmap_pte_range(pmd, addr, next, walk);
} while (pmd++, addr = next, addr != end);
@@ -245,6 +263,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
set_pte_at(&init_mm, addr, pte, entry);
}
+/*
+ * How many struct page structs need to be reset. When we reuse the head
+ * struct page, the special metadata (e.g. page->flags or page->mapping)
+ * cannot copy to the tail struct page structs. The invalid value will be
+ * checked in the free_tail_pages_check(). In order to avoid the message
+ * of "corrupted mapping in tail page". We need to reset at least 3 (one
+ * head struct page struct and two tail struct page structs) struct page
+ * structs.
+ */
+#define NR_RESET_STRUCT_PAGE 3
+
+static inline void reset_struct_pages(struct page *start)
+{
+ int i;
+ struct page *from = start + NR_RESET_STRUCT_PAGE;
+
+ for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
+ memcpy(start + i, from, sizeof(*from));
+}
+
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
struct vmemmap_remap_walk *walk)
{
@@ -258,6 +296,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
list_del(&page->lru);
to = page_to_virt(page);
copy_page(to, (void *)walk->reuse_addr);
+ reset_struct_pages(to);
set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}
@@ -300,10 +339,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end,
*/
BUG_ON(start - reuse != PAGE_SIZE);
- mmap_write_lock(&init_mm);
+ mmap_read_lock(&init_mm);
ret = vmemmap_remap_range(reuse, end, &walk);
- mmap_write_downgrade(&init_mm);
-
if (ret && walk.nr_walked) {
end = reuse + walk.nr_walked * PAGE_SIZE;
/*
@@ -383,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
return 0;
}
+#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
/*
* Allocate a block of memory to be used to back the virtual memory map
diff --git a/mm/sparse.c b/mm/sparse.c
index d21c6e5910d07..952f06d8f3731 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -126,7 +126,7 @@ static inline int sparse_early_nid(struct mem_section *section)
}
/* Validate the physical addressing limitations of the model */
-void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
+static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
diff --git a/mm/swap.c b/mm/swap.c
index bcf3ac288b56d..842d5cd92cf64 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -74,8 +74,8 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
};
/*
- * This path almost never happens for VM activity - pages are normally
- * freed via pagevecs. But it gets used by networking.
+ * This path almost never happens for VM activity - pages are normally freed
+ * via pagevecs. But it gets used by networking - and for compound pages.
*/
static void __page_cache_release(struct page *page)
{
@@ -89,6 +89,14 @@ static void __page_cache_release(struct page *page)
__clear_page_lru_flags(page);
unlock_page_lruvec_irqrestore(lruvec, flags);
}
+ /* See comment on PageMlocked in release_pages() */
+ if (unlikely(PageMlocked(page))) {
+ int nr_pages = thp_nr_pages(page);
+
+ __ClearPageMlocked(page);
+ mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
+ }
__ClearPageWaiters(page);
}
@@ -482,22 +490,12 @@ EXPORT_SYMBOL(folio_add_lru);
void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
{
- bool unevictable;
-
VM_BUG_ON_PAGE(PageLRU(page), page);
- unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
- if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
- int nr_pages = thp_nr_pages(page);
- /*
- * We use the irq-unsafe __mod_zone_page_state because this
- * counter is not modified from interrupt context, and the pte
- * lock is held(spinlock), which implies preemption disabled.
- */
- __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
- count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
- }
- lru_cache_add(page);
+ if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
+ mlock_new_page(page);
+ else
+ lru_cache_add(page);
}
/*
@@ -636,6 +634,7 @@ void lru_add_drain_cpu(int cpu)
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
activate_page_drain(cpu);
+ mlock_page_drain(cpu);
}
/**
@@ -838,6 +837,7 @@ inline void __lru_add_drain_all(bool force_all_cpus)
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
need_activate_page_drain(cpu) ||
+ need_mlock_page_drain(cpu) ||
has_bh_in_lru(cpu, NULL)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
@@ -969,6 +969,18 @@ void release_pages(struct page **pages, int nr)
__clear_page_lru_flags(page);
}
+ /*
+ * In rare cases, when truncation or holepunching raced with
+ * munlock after VM_LOCKED was cleared, Mlocked may still be
+ * found set here. This does not indicate a problem, unless
+ * "unevictable_pgs_cleared" appears worryingly large.
+ */
+ if (unlikely(PageMlocked(page))) {
+ __ClearPageMlocked(page);
+ dec_zone_page_state(page, NR_MLOCK);
+ count_vm_event(UNEVICTABLE_PGCLEARED);
+ }
+
__ClearPageWaiters(page);
list_add(&page->lru, &pages_to_free);
@@ -1009,43 +1021,32 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ folio_set_lru(folio);
/*
- * A folio becomes evictable in two ways:
- * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
- * 2) Before acquiring LRU lock to put the folio on the correct LRU
- * and then
- * a) do PageLRU check with lock [check_move_unevictable_pages]
- * b) do PageLRU check before lock [clear_page_mlock]
- *
- * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
- * following strict ordering:
- *
- * #0: __pagevec_lru_add_fn #1: clear_page_mlock
- *
- * folio_set_lru() folio_test_clear_mlocked()
- * smp_mb() // explicit ordering // above provides strict
- * // ordering
- * folio_test_mlocked() folio_test_lru()
+ * Is an smp_mb__after_atomic() still required here, before
+ * folio_evictable() tests PageMlocked, to rule out the possibility
+ * of stranding an evictable folio on an unevictable LRU? I think
+ * not, because __munlock_page() only clears PageMlocked while the LRU
+ * lock is held.
*
- *
- * if '#1' does not observe setting of PG_lru by '#0' and
- * fails isolation, the explicit barrier will make sure that
- * folio_evictable check will put the folio on the correct
- * LRU. Without smp_mb(), folio_set_lru() can be reordered
- * after folio_test_mlocked() check and can make '#1' fail the
- * isolation of the folio whose mlocked bit is cleared (#0 is
- * also looking at the same folio) and the evictable folio will
- * be stranded on an unevictable LRU.
+ * (That is not true of __page_cache_release(), and not necessarily
+ * true of release_pages(): but those only clear PageMlocked after
+ * put_page_testzero() has excluded any other users of the page.)
*/
- folio_set_lru(folio);
- smp_mb__after_atomic();
-
if (folio_evictable(folio)) {
if (was_unevictable)
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
} else {
folio_clear_active(folio);
folio_set_unevictable(folio);
+ /*
+ * folio->mlock_count = !!folio_test_mlocked(folio)?
+ * But that leaves __mlock_page() in doubt whether another
+ * actor has already counted the mlock or not. Err on the
+ * safe side, underestimate, let page reclaim fix it, rather
+ * than leaving a page on the unevictable LRU indefinitely.
+ */
+ folio->mlock_count = 0;
if (!was_unevictable)
__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bf0df7aa7158f..a5183315dc585 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1167,16 +1167,6 @@ out:
return NULL;
}
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
-{
- struct swap_info_struct *p;
-
- p = _swap_info_get(entry);
- if (p)
- spin_lock(&p->lock);
- return p;
-}
-
static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
struct swap_info_struct *q)
{
@@ -1601,100 +1591,6 @@ static bool page_swapped(struct page *page)
return false;
}
-static int page_trans_huge_map_swapcount(struct page *page,
- int *total_swapcount)
-{
- int i, map_swapcount, _total_swapcount;
- unsigned long offset = 0;
- struct swap_info_struct *si;
- struct swap_cluster_info *ci = NULL;
- unsigned char *map = NULL;
- int swapcount = 0;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
- if (PageSwapCache(page))
- swapcount = page_swapcount(page);
- if (total_swapcount)
- *total_swapcount = swapcount;
- return swapcount + page_trans_huge_mapcount(page);
- }
-
- page = compound_head(page);
-
- _total_swapcount = map_swapcount = 0;
- if (PageSwapCache(page)) {
- swp_entry_t entry;
-
- entry.val = page_private(page);
- si = _swap_info_get(entry);
- if (si) {
- map = si->swap_map;
- offset = swp_offset(entry);
- }
- }
- if (map)
- ci = lock_cluster(si, offset);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- int mapcount = atomic_read(&page[i]._mapcount) + 1;
- if (map) {
- swapcount = swap_count(map[offset + i]);
- _total_swapcount += swapcount;
- }
- map_swapcount = max(map_swapcount, mapcount + swapcount);
- }
- unlock_cluster(ci);
-
- if (PageDoubleMap(page))
- map_swapcount -= 1;
-
- if (total_swapcount)
- *total_swapcount = _total_swapcount;
-
- return map_swapcount + compound_mapcount(page);
-}
-
-/*
- * We can write to an anon page without COW if there are no other references
- * to it. And as a side-effect, free up its swap: because the old content
- * on disk will never be read, and seeking back there to write new content
- * later would only waste time away from clustering.
- */
-bool reuse_swap_page(struct page *page)
-{
- int count, total_swapcount;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (unlikely(PageKsm(page)))
- return false;
- count = page_trans_huge_map_swapcount(page, &total_swapcount);
- if (count == 1 && PageSwapCache(page) &&
- (likely(!PageTransCompound(page)) ||
- /* The remaining swap count will be freed soon */
- total_swapcount == page_swapcount(page))) {
- if (!PageWriteback(page)) {
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
- } else {
- swp_entry_t entry;
- struct swap_info_struct *p;
-
- entry.val = page_private(page);
- p = swap_info_get(entry);
- if (p->flags & SWP_STABLE_WRITES) {
- spin_unlock(&p->lock);
- return false;
- }
- spin_unlock(&p->lock);
- }
- }
-
- return count <= 1;
-}
-
/*
* If swap is getting full, or if there are no more mappings of this page,
* then try_to_free_swap is called to free its swap space.
diff --git a/mm/usercopy.c b/mm/usercopy.c
index d0d268135d96d..e7b0cb49daa1b 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -70,17 +70,6 @@ static noinline int check_stack_object(const void *obj, unsigned long len)
* kmem_cache_create_usercopy() function to create the cache (and
* carefully audit the whitelist range).
*/
-void usercopy_warn(const char *name, const char *detail, bool to_user,
- unsigned long offset, unsigned long len)
-{
- WARN_ONCE(1, "Bad or missing usercopy whitelist? Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
- to_user ? "exposure" : "overwrite",
- to_user ? "from" : "to",
- name ? : "unknown?!",
- detail ? " '" : "", detail ? : "", detail ? "'" : "",
- offset, len);
-}
-
void __noreturn usercopy_abort(const char *name, const char *detail,
bool to_user, unsigned long offset,
unsigned long len)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0780c2a57ff11..15d3e97a6e045 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -95,10 +95,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
if (!pte_none(*dst_pte))
goto out_unlock;
- if (page_in_cache)
- page_add_file_rmap(page, false);
- else
+ if (page_in_cache) {
+ /* Usually, cache pages are already added to LRU */
+ if (newly_allocated)
+ lru_cache_add(page);
+ page_add_file_rmap(page, dst_vma, false);
+ } else {
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+ lru_cache_add_inactive_or_unevictable(page, dst_vma);
+ }
/*
* Must happen after rmap, as mm_counter() checks mapping (via
@@ -106,9 +111,6 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
*/
inc_mm_counter(dst_mm, mm_counter(page));
- if (newly_allocated)
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
-
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4165304d35471..b454cf1a261f9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -74,7 +74,7 @@ static const bool vmap_allow_huge = false;
bool is_vmalloc_addr(const void *x)
{
- unsigned long addr = (unsigned long)x;
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
return addr >= VMALLOC_START && addr < VMALLOC_END;
}
@@ -118,7 +118,6 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (size != PAGE_SIZE) {
pte_t entry = pfn_pte(pfn, prot);
- entry = pte_mkhuge(entry);
entry = arch_make_huge_pte(entry, ilog2(size), 0);
set_huge_pte_at(&init_mm, addr, pte, entry);
pfn += PFN_DOWN(size);
@@ -479,6 +478,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
return -EBUSY;
if (WARN_ON(!page))
return -ENOMEM;
+ if (WARN_ON(!pfn_valid(page_to_pfn(page))))
+ return -EINVAL;
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -632,7 +633,7 @@ int is_vmalloc_or_module_addr(const void *x)
* just put it in the vmalloc space.
*/
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
- unsigned long addr = (unsigned long)x;
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
if (addr >= MODULES_VADDR && addr < MODULES_END)
return 1;
#endif
@@ -776,23 +777,13 @@ get_subtree_max_size(struct rb_node *node)
return va ? va->subtree_max_size : 0;
}
-/*
- * Gets called when remove the node and rotate.
- */
-static __always_inline unsigned long
-compute_subtree_max_size(struct vmap_area *va)
-{
- return max3(va_size(va),
- get_subtree_max_size(va->rb_node.rb_left),
- get_subtree_max_size(va->rb_node.rb_right));
-}
-
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
-static unsigned long lazy_max_pages(void);
+static void drain_vmap_area_work(struct work_struct *work);
+static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
static atomic_long_t nr_vmalloc_pages;
@@ -806,6 +797,8 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
struct vmap_area *va = NULL;
struct rb_node *n = vmap_area_root.rb_node;
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
+
while (n) {
struct vmap_area *tmp;
@@ -827,6 +820,8 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
{
struct rb_node *n = vmap_area_root.rb_node;
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
+
while (n) {
struct vmap_area *va;
@@ -973,6 +968,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
}
#if DEBUG_AUGMENT_PROPAGATE_CHECK
+/*
+ * Gets called when remove the node and rotate.
+ */
+static __always_inline unsigned long
+compute_subtree_max_size(struct vmap_area *va)
+{
+ return max3(va_size(va),
+ get_subtree_max_size(va->rb_node.rb_left),
+ get_subtree_max_size(va->rb_node.rb_right));
+}
+
static void
augment_tree_propagate_check(void)
{
@@ -1189,22 +1195,28 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
/*
* Find the first free block(lowest start address) in the tree,
* that will accomplish the request corresponding to passing
- * parameters.
+ * parameters. Please note, with an alignment bigger than PAGE_SIZE,
+ * a search length is adjusted to account for worst case alignment
+ * overhead.
*/
static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size,
- unsigned long align, unsigned long vstart)
+find_vmap_lowest_match(unsigned long size, unsigned long align,
+ unsigned long vstart, bool adjust_search_size)
{
struct vmap_area *va;
struct rb_node *node;
+ unsigned long length;
/* Start from the root. */
node = free_vmap_area_root.rb_node;
+ /* Adjust the search size for alignment overhead. */
+ length = adjust_search_size ? size + align - 1 : size;
+
while (node) {
va = rb_entry(node, struct vmap_area, rb_node);
- if (get_subtree_max_size(node->rb_left) >= size &&
+ if (get_subtree_max_size(node->rb_left) >= length &&
vstart < va->va_start) {
node = node->rb_left;
} else {
@@ -1214,9 +1226,9 @@ find_vmap_lowest_match(unsigned long size,
/*
* Does not make sense to go deeper towards the right
* sub-tree if it does not have a free block that is
- * equal or bigger to the requested search size.
+ * equal or bigger to the requested search length.
*/
- if (get_subtree_max_size(node->rb_right) >= size) {
+ if (get_subtree_max_size(node->rb_right) >= length) {
node = node->rb_right;
continue;
}
@@ -1232,7 +1244,7 @@ find_vmap_lowest_match(unsigned long size,
if (is_within_this_va(va, size, align, vstart))
return va;
- if (get_subtree_max_size(node->rb_right) >= size &&
+ if (get_subtree_max_size(node->rb_right) >= length &&
vstart <= va->va_start) {
/*
* Shift the vstart forward. Please note, we update it with
@@ -1280,7 +1292,7 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
- va_1 = find_vmap_lowest_match(size, align, vstart);
+ va_1 = find_vmap_lowest_match(size, align, vstart, false);
va_2 = find_vmap_lowest_linear_match(size, align, vstart);
if (va_1 != va_2)
@@ -1431,12 +1443,25 @@ static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
unsigned long vstart, unsigned long vend)
{
+ bool adjust_search_size = true;
unsigned long nva_start_addr;
struct vmap_area *va;
enum fit_type type;
int ret;
- va = find_vmap_lowest_match(size, align, vstart);
+ /*
+ * Do not adjust when:
+ * a) align <= PAGE_SIZE, because it does not make any sense.
+ * All blocks(their start addresses) are at least PAGE_SIZE
+ * aligned anyway;
+ * b) a short range where a requested size corresponds to exactly
+ * specified [vstart:vend] interval and an alignment > PAGE_SIZE.
+ * With adjusted search length an allocation would not succeed.
+ */
+ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
+ adjust_search_size = false;
+
+ va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
if (unlikely(!va))
return vend;
@@ -1720,18 +1745,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
}
/*
- * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
- * is already purging.
- */
-static void try_purge_vmap_area_lazy(void)
-{
- if (mutex_trylock(&vmap_purge_lock)) {
- __purge_vmap_area_lazy(ULONG_MAX, 0);
- mutex_unlock(&vmap_purge_lock);
- }
-}
-
-/*
* Kick off a purge of the outstanding lazy areas.
*/
static void purge_vmap_area_lazy(void)
@@ -1742,6 +1755,20 @@ static void purge_vmap_area_lazy(void)
mutex_unlock(&vmap_purge_lock);
}
+static void drain_vmap_area_work(struct work_struct *work)
+{
+ unsigned long nr_lazy;
+
+ do {
+ mutex_lock(&vmap_purge_lock);
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
+
+ /* Recheck if further work is required. */
+ nr_lazy = atomic_long_read(&vmap_lazy_nr);
+ } while (nr_lazy > lazy_max_pages());
+}
+
/*
* Free a vmap area, caller ensuring that the area has been unmapped
* and flush_cache_vunmap had been called for the correct range
@@ -1768,7 +1795,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
/* After this point, we may free va at any time */
if (unlikely(nr_lazy > lazy_max_pages()))
- try_purge_vmap_area_lazy();
+ schedule_work(&drain_vmap_work);
}
/*
@@ -2145,7 +2172,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
void vm_unmap_ram(const void *mem, unsigned int count)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
- unsigned long addr = (unsigned long)mem;
+ unsigned long addr = (unsigned long)kasan_reset_tag(mem);
struct vmap_area *va;
might_sleep();
@@ -2206,14 +2233,19 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
mem = (void *)addr;
}
- kasan_unpoison_vmalloc(mem, size);
-
if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
+ /*
+ * Mark the pages as accessible, now that they are mapped.
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
+
return mem;
}
EXPORT_SYMBOL(vm_map_ram);
@@ -2439,10 +2471,20 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}
- kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
-
setup_vmalloc_vm(area, va, flags, caller);
+ /*
+ * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
+ * best-effort approach, as they can be mapped outside of vmalloc code.
+ * For VM_ALLOC mappings, the pages are marked as accessible after
+ * getting mapped in __vmalloc_node_range().
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ if (!(flags & VM_ALLOC))
+ area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+
return area;
}
@@ -2526,7 +2568,7 @@ struct vm_struct *remove_vm_area(const void *addr)
va->vm = NULL;
spin_unlock(&vmap_area_lock);
- kasan_free_shadow(vm);
+ kasan_free_module_shadow(vm);
free_unmap_vmap_area(va);
return vm;
@@ -2925,7 +2967,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- const gfp_t orig_gfp_mask = gfp_mask;
bool nofail = gfp_mask & __GFP_NOFAIL;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area);
@@ -2949,7 +2990,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
if (!area->pages) {
- warn_alloc(orig_gfp_mask, NULL,
+ warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
free_vm_area(area);
@@ -2959,8 +3000,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
page_order = vm_area_page_order(area);
- area->nr_pages = vm_area_alloc_pages(gfp_mask, node,
- page_order, nr_small_pages, area->pages);
+ area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
+ node, page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
if (gfp_mask & __GFP_ACCOUNT) {
@@ -2976,7 +3017,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* allocation request, free them via __vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
- warn_alloc(orig_gfp_mask, NULL,
+ warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, page order %u, failed to allocate pages",
area->nr_pages * PAGE_SIZE, page_order);
goto fail;
@@ -3004,7 +3045,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
memalloc_noio_restore(flags);
if (ret < 0) {
- warn_alloc(orig_gfp_mask, NULL,
+ warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to map pages",
area->nr_pages * PAGE_SIZE);
goto fail;
@@ -3051,7 +3092,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
const void *caller)
{
struct vm_struct *area;
- void *addr;
+ void *ret;
+ kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
unsigned long real_size = size;
unsigned long real_align = align;
unsigned int shift = PAGE_SHIFT;
@@ -3104,11 +3146,51 @@ again:
goto fail;
}
- addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
- if (!addr)
+ /*
+ * Prepare arguments for __vmalloc_area_node() and
+ * kasan_unpoison_vmalloc().
+ */
+ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
+ if (kasan_hw_tags_enabled()) {
+ /*
+ * Modify protection bits to allow tagging.
+ * This must be done before mapping.
+ */
+ prot = arch_vmap_pgprot_tagged(prot);
+
+ /*
+ * Skip page_alloc poisoning and zeroing for physical
+ * pages backing VM_ALLOC mapping. Memory is instead
+ * poisoned and zeroed by kasan_unpoison_vmalloc().
+ */
+ gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
+ }
+
+ /* Take note that the mapping is PAGE_KERNEL. */
+ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
+ }
+
+ /* Allocate physical pages and map them into vmalloc space. */
+ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
+ if (!ret)
goto fail;
/*
+ * Mark the pages as accessible, now that they are mapped.
+ * The init condition should match the one in post_alloc_hook()
+ * (except for the should_skip_init() check) to make sure that memory
+ * is initialized under the same conditions regardless of the enabled
+ * KASAN mode.
+ * Tag-based KASAN modes only assign tags to normal non-executable
+ * allocations, see __kasan_unpoison_vmalloc().
+ */
+ kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
+ if (!want_init_on_free() && want_init_on_alloc(gfp_mask))
+ kasan_flags |= KASAN_VMALLOC_INIT;
+ /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
+ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
+
+ /*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
@@ -3119,7 +3201,7 @@ again:
if (!(vm_flags & VM_DEFER_KMEMLEAK))
kmemleak_vmalloc(area, size, gfp_mask);
- return addr;
+ return area->addr;
fail:
if (shift > PAGE_SHIFT) {
@@ -3404,6 +3486,8 @@ long vread(char *buf, char *addr, unsigned long count)
unsigned long buflen = count;
unsigned long n;
+ addr = kasan_reset_tag(addr);
+
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
@@ -3789,9 +3873,6 @@ retry:
for (area = 0; area < nr_vms; area++) {
if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
goto err_free_shadow;
-
- kasan_unpoison_vmalloc((void *)vas[area]->va_start,
- sizes[area]);
}
/* insert all vm's */
@@ -3804,6 +3885,16 @@ retry:
}
spin_unlock(&vmap_area_lock);
+ /*
+ * Mark allocated areas as accessible. Do it now as a best-effort
+ * approach, as they can be mapped outside of vmalloc code.
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ for (area = 0; area < nr_vms; area++)
+ vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
+ vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
+
kfree(vas);
return vms;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 090bfb605ecf0..5f471c1e279fe 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,7 @@
#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
+#include <linux/sched/sysctl.h>
#include "internal.h"
@@ -989,17 +990,6 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
-static int may_write_to_inode(struct inode *inode)
-{
- if (current->flags & PF_SWAPWRITE)
- return 1;
- if (!inode_write_congested(inode))
- return 1;
- if (inode_to_bdi(inode) == current->backing_dev_info)
- return 1;
- return 0;
-}
-
/*
* We detected a synchronous write error writing a page out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
@@ -1066,8 +1056,10 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
* forward progress (e.g. journalling workqueues or kthreads).
*/
if (!current_is_kswapd() &&
- current->flags & (PF_IO_WORKER|PF_KTHREAD))
+ current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
+ cond_resched();
return;
+ }
/*
* These figures are pulled out of thin air.
@@ -1199,8 +1191,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_inode(mapping->host))
- return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
int res;
@@ -1386,11 +1376,11 @@ static enum page_references page_check_references(struct page *page,
referenced_page = TestClearPageReferenced(page);
/*
- * Mlock lost the isolation race with us. Let try_to_unmap()
- * move the page to the unevictable list.
+ * The supposedly reclaimable page was found to be in a VM_LOCKED vma.
+ * Let the page, now marked Mlocked, be moved to the unevictable list.
*/
if (vm_flags & VM_LOCKED)
- return PAGEREF_RECLAIM;
+ return PAGEREF_ACTIVATE;
if (referenced_ptes) {
/*
@@ -1576,9 +1566,7 @@ retry:
* end of the LRU a second time.
*/
mapping = page_mapping(page);
- if (((dirty || writeback) && mapping &&
- inode_write_congested(mapping->host)) ||
- (writeback && PageReclaim(page)))
+ if (writeback && PageReclaim(page))
stat->nr_congested++;
/*
@@ -1729,7 +1717,7 @@ retry:
mapping = page_mapping(page);
}
} else if (unlikely(PageTransHuge(page))) {
- /* Split file THP */
+ /* Split file/lazyfree THP */
if (split_huge_page_to_list(page, page_list))
goto keep_locked;
}
@@ -2377,9 +2365,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
*/
static int current_may_throttle(void)
{
- return !(current->flags & PF_LOCAL_THROTTLE) ||
- current->backing_dev_info == NULL ||
- bdi_write_congested(current->backing_dev_info);
+ return !(current->flags & PF_LOCAL_THROTTLE);
}
/*
@@ -3956,6 +3942,13 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
}
/*
+ * Keep the free pages on fast memory node a little more than the high
+ * watermark to accommodate the promoted pages.
+ */
+#define NUMA_BALANCING_PROMOTE_WATERMARK_DIV 4
+#define NUMA_BALANCING_PROMOTE_WATERMARK_MIN (10UL * 1024 * 1024 >> PAGE_SHIFT)
+
+/*
* Returns true if there is an eligible zone balanced for the request order
* and highest_zoneidx
*/
@@ -3976,6 +3969,15 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
continue;
mark = high_wmark_pages(zone);
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ numa_demotion_enabled &&
+ next_demotion_node(pgdat->node_id) != NUMA_NO_NODE) {
+ unsigned long promote_mark;
+
+ promote_mark = max(NUMA_BALANCING_PROMOTE_WATERMARK_MIN,
+ mark / NUMA_BALANCING_PROMOTE_WATERMARK_DIV);
+ mark += promote_mark;
+ }
if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
return true;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4057372745d04..d5cc8d739fac1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1242,6 +1242,9 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_SWAP
"nr_swapcached",
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ "pgpromote_success",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1385,6 +1388,9 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_SWAP
"swap_ra",
"swap_ra_hit",
+#ifdef CONFIG_KSM
+ "ksm_swpin_copy",
+#endif
#endif
#ifdef CONFIG_X86
"direct_map_level2_splits",
diff --git a/mm/zswap.c b/mm/zswap.c
index cdf6950fcb2e3..3efd8cae315e7 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -120,11 +120,19 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
uint, 0644);
-/* Enable/disable handling same-value filled pages (enabled by default) */
+/*
+ * Enable/disable handling same-value filled pages (enabled by default).
+ * If disabled every page is considered non-same-value filled.
+ */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
bool, 0644);
+/* Enable/disable handling non-same-value filled pages (enabled by default) */
+static bool zswap_non_same_filled_pages_enabled = true;
+module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
+ bool, 0644);
+
/*********************************
* data structures
**********************************/
@@ -1147,6 +1155,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
kunmap_atomic(src);
}
+ if (!zswap_non_same_filled_pages_enabled) {
+ ret = -EINVAL;
+ goto freepage;
+ }
+
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
if (!entry->pool) {
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index b01c36a15d9dd..071b3304f5d7c 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -584,8 +584,7 @@ our $typeTypedefs = qr{(?x:
our $zero_initializer = qr{(?:(?:0[xX])?0+$Int_type?|NULL|false)\b};
-our $logFunctions = qr{(?x:
- printk(?:_ratelimited|_once|_deferred_once|_deferred|)|
+our $logFunctionsCore = qr{(?x:
(?:[a-z0-9]+_){1,2}(?:printk|emerg|alert|crit|err|warning|warn|notice|info|debug|dbg|vdbg|devel|cont|WARN)(?:_ratelimited|_once|)|
TP_printk|
WARN(?:_RATELIMIT|_ONCE|)|
@@ -594,6 +593,11 @@ our $logFunctions = qr{(?x:
seq_vprintf|seq_printf|seq_puts
)};
+our $logFunctions = qr{(?x:
+ printk(?:_ratelimited|_once|_deferred_once|_deferred|)|
+ $logFunctionsCore
+)};
+
our $allocFunctions = qr{(?x:
(?:(?:devm_)?
(?:kv|k|v)[czm]alloc(?:_array)?(?:_node)? |
@@ -5551,6 +5555,7 @@ sub process {
defined($stat) && defined($cond) &&
$line =~ /\b(?:if|while|for)\s*\(/ && $line !~ /^.\s*#/) {
my ($s, $c) = ($stat, $cond);
+ my $fixed_assign_in_if = 0;
if ($c =~ /\bif\s*\(.*[^<>!=]=[^=].*/s) {
if (ERROR("ASSIGN_IN_IF",
@@ -5575,6 +5580,7 @@ sub process {
$newline .= ')';
$newline .= " {" if (defined($brace));
fix_insert_line($fixlinenr + 1, $newline);
+ $fixed_assign_in_if = 1;
}
}
}
@@ -5598,8 +5604,20 @@ sub process {
$stat_real = "[...]\n$stat_real";
}
- ERROR("TRAILING_STATEMENTS",
- "trailing statements should be on next line\n" . $herecurr . $stat_real);
+ if (ERROR("TRAILING_STATEMENTS",
+ "trailing statements should be on next line\n" . $herecurr . $stat_real) &&
+ !$fixed_assign_in_if &&
+ $cond_lines == 0 &&
+ $fix && $perl_version_ok &&
+ $fixed[$fixlinenr] =~ /^\+(\s*)((?:if|while|for)\s*$balanced_parens)\s*(.*)$/) {
+ my $indent = $1;
+ my $test = $2;
+ my $rest = rtrim($4);
+ if ($rest =~ /;$/) {
+ $fixed[$fixlinenr] = "\+$indent$test";
+ fix_insert_line($fixlinenr + 1, "$indent\t$rest");
+ }
+ }
}
}
@@ -6298,8 +6316,7 @@ sub process {
}
# check for logging functions with KERN_<LEVEL>
- if ($line !~ /printk(?:_ratelimited|_once)?\s*\(/ &&
- $line =~ /\b$logFunctions\s*\(.*\b(KERN_[A-Z]+)\b/) {
+ if ($line =~ /\b$logFunctionsCore\s*\(.*\b(KERN_[A-Z]+)\b/) {
my $level = $1;
if (WARN("UNNECESSARY_KERN_LEVEL",
"Possible unnecessary $level\n" . $herecurr) &&
@@ -7418,6 +7435,13 @@ sub process {
WARN("MODULE_LICENSE",
"unknown module license " . $extracted_string . "\n" . $herecurr);
}
+ if (!$file && $extracted_string eq '"GPL v2"') {
+ if (WARN("MODULE_LICENSE",
+ "Prefer \"GPL\" over \"GPL v2\" - see commit bf7fbeeae6db (\"module: Cure the MODULE_LICENSE \"GPL\" vs. \"GPL v2\" bogosity\")\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/\bMODULE_LICENSE\s*\(\s*"GPL v2"\s*\)/MODULE_LICENSE("GPL")/;
+ }
+ }
}
# check for sysctl duplicate constants
diff --git a/tools/include/linux/gfp.h b/tools/include/linux/gfp.h
index b238dbc9eb858..56eec4445bc9e 100644
--- a/tools/include/linux/gfp.h
+++ b/tools/include/linux/gfp.h
@@ -12,7 +12,6 @@
#define __GFP_FS 0x80u
#define __GFP_NOWARN 0x200u
#define __GFP_ZERO 0x8000u
-#define __GFP_ATOMIC 0x80000u
#define __GFP_ACCOUNT 0x100000u
#define __GFP_DIRECT_RECLAIM 0x400000u
#define __GFP_KSWAPD_RECLAIM 0x2000000u
@@ -20,7 +19,7 @@
#define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)
#define GFP_ZONEMASK 0x0fu
-#define GFP_ATOMIC (__GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM)
+#define GFP_ATOMIC (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 99d7ff9a8effe..e5b38d0b08fb5 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -640,7 +640,6 @@ static const struct {
{ "__GFP_HIGHMEM", "HM" },
{ "GFP_DMA32", "D32" },
{ "__GFP_HIGH", "H" },
- { "__GFP_ATOMIC", "_A" },
{ "__GFP_IO", "I" },
{ "__GFP_FS", "F" },
{ "__GFP_NOWARN", "NWR" },
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index d6a54a1d9d3a1..4eda7c7c15694 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -115,23 +115,35 @@ ifdef building_out_of_srctree
override LDFLAGS =
endif
-ifneq ($(O),)
- BUILD := $(O)/kselftest
+top_srcdir ?= ../../..
+
+ifeq ("$(origin O)", "command line")
+ KBUILD_OUTPUT := $(O)
+endif
+
+ifneq ($(KBUILD_OUTPUT),)
+ # Make's built-in functions such as $(abspath ...), $(realpath ...) cannot
+ # expand a shell special character '~'. We use a somewhat tedious way here.
+ abs_objtree := $(shell cd $(top_srcdir) && mkdir -p $(KBUILD_OUTPUT) && cd $(KBUILD_OUTPUT) && pwd)
+ $(if $(abs_objtree),, \
+ $(error failed to create output directory "$(KBUILD_OUTPUT)"))
+ # $(realpath ...) resolves symlinks
+ abs_objtree := $(realpath $(abs_objtree))
+ BUILD := $(abs_objtree)/kselftest
+ KHDR_INCLUDES := -I${abs_objtree}/usr/include
else
- ifneq ($(KBUILD_OUTPUT),)
- BUILD := $(KBUILD_OUTPUT)/kselftest
- else
- BUILD := $(shell pwd)
- DEFAULT_INSTALL_HDR_PATH := 1
- endif
+ BUILD := $(CURDIR)
+ abs_srctree := $(shell cd $(top_srcdir) && pwd)
+ KHDR_INCLUDES := -I${abs_srctree}/usr/include
+ DEFAULT_INSTALL_HDR_PATH := 1
endif
# Prepare for headers install
-top_srcdir ?= ../../..
include $(top_srcdir)/scripts/subarch.include
ARCH ?= $(SUBARCH)
export KSFT_KHDR_INSTALL_DONE := 1
export BUILD
+export KHDR_INCLUDES
# set default goal to all, so make without a target runs all, even when
# all isn't the first target in the file.
@@ -156,7 +168,7 @@ khdr:
ifeq (1,$(DEFAULT_INSTALL_HDR_PATH))
$(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install
else
- $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$BUILD/usr \
+ $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$(abs_objtree)/usr \
ARCH=$(ARCH) -C $(top_srcdir) headers_install
endif
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
index 12c5e27d32c16..551affb437fe1 100644
--- a/tools/testing/selftests/exec/Makefile
+++ b/tools/testing/selftests/exec/Makefile
@@ -10,6 +10,7 @@ TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir
TEST_FILES := Makefile
TEST_GEN_PROGS += recursion-depth
+TEST_GEN_PROGS += null-argv
EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx* \
$(OUTPUT)/S_I*.test
diff --git a/tools/testing/selftests/exec/null-argv.c b/tools/testing/selftests/exec/null-argv.c
new file mode 100644
index 0000000000000..c19726e710d19
--- /dev/null
+++ b/tools/testing/selftests/exec/null-argv.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Test that empty argvs are swapped out for a single empty string. */
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
+
+#define FORK(exec) \
+do { \
+ pid = fork(); \
+ if (pid == 0) { \
+ /* Child */ \
+ exec; /* Some kind of exec */ \
+ perror("# " #exec); \
+ return 1; \
+ } \
+ check_result(pid, #exec); \
+} while (0)
+
+void check_result(pid_t pid, const char *msg)
+{
+ int wstatus;
+
+ if (pid == (pid_t)-1) {
+ perror("# fork");
+ ksft_test_result_fail("fork failed: %s\n", msg);
+ return;
+ }
+ if (waitpid(pid, &wstatus, 0) < 0) {
+ perror("# waitpid");
+ ksft_test_result_fail("waitpid failed: %s\n", msg);
+ return;
+ }
+ if (!WIFEXITED(wstatus)) {
+ ksft_test_result_fail("child did not exit: %s\n", msg);
+ return;
+ }
+ if (WEXITSTATUS(wstatus) != 0) {
+ ksft_test_result_fail("non-zero exit: %s\n", msg);
+ return;
+ }
+ ksft_test_result_pass("%s\n", msg);
+}
+
+int main(int argc, char *argv[], char *envp[])
+{
+ pid_t pid;
+ static char * const args[] = { NULL };
+ static char * const str[] = { "", NULL };
+
+ /* argc counting checks */
+ if (argc < 1) {
+ fprintf(stderr, "# FAIL: saw argc == 0 (old kernel?)\n");
+ return 1;
+ }
+ if (argc != 1) {
+ fprintf(stderr, "# FAIL: unknown argc (%d)\n", argc);
+ return 1;
+ }
+ if (argv[0][0] == '\0') {
+ /* Good, we found a NULL terminated string at argv[0]! */
+ return 0;
+ }
+
+ /* Test runner. */
+ ksft_print_header();
+ ksft_set_plan(5);
+
+ FORK(execve(argv[0], str, NULL));
+ FORK(execve(argv[0], NULL, NULL));
+ FORK(execve(argv[0], NULL, envp));
+ FORK(execve(argv[0], args, NULL));
+ FORK(execve(argv[0], args, envp));
+
+ ksft_exit(ksft_cnt.ksft_pass == ksft_plan);
+}
diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile
index 9a8c3700d7738..b8152c573e8aa 100644
--- a/tools/testing/selftests/futex/functional/Makefile
+++ b/tools/testing/selftests/futex/functional/Makefile
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-INCLUDES := -I../include -I../../ -I../../../../../usr/include/ \
- -I$(KBUILD_OUTPUT)/kselftest/usr/include
-CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES)
+INCLUDES := -I../include -I../../ -I../../../../../usr/include/
+CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) $(KHDR_INCLUDES)
LDLIBS := -lpthread -lrt
HEADERS := \
diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h
index f1180987492c9..b8f248018174d 100644
--- a/tools/testing/selftests/kselftest.h
+++ b/tools/testing/selftests/kselftest.h
@@ -28,6 +28,7 @@
*
* When all tests are finished, clean up and exit the program with one of:
*
+ * ksft_finished();
* ksft_exit(condition);
* ksft_exit_pass();
* ksft_exit_fail();
@@ -235,6 +236,15 @@ static inline int ksft_exit_fail(void)
ksft_exit_fail(); \
} while (0)
+/**
+ * ksft_finished() - Exit selftest with success if all tests passed
+ */
+#define ksft_finished() \
+ ksft_exit(ksft_plan == \
+ ksft_cnt.ksft_pass + \
+ ksft_cnt.ksft_xfail + \
+ ksft_cnt.ksft_xskip)
+
static inline int ksft_exit_fail_msg(const char *msg, ...)
{
int saved_errno = errno;
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index a1ce7dd40e3ef..1771e188dd448 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -151,7 +151,7 @@ endif
CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
-fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
-I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
- -I$(<D) -Iinclude/$(UNAME_M) -I.. $(EXTRA_CFLAGS)
+ -I$(<D) -Iinclude/$(UNAME_M) -I.. $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \
$(CC) -Werror -no-pie -x c - -o "$$TMP", -no-pie)
diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile
index a99596ca9882b..0b0049e133bba 100644
--- a/tools/testing/selftests/landlock/Makefile
+++ b/tools/testing/selftests/landlock/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS += -Wall -O2
+CFLAGS += -Wall -O2 $(KHDR_INCLUDES)
src_test := $(wildcard *_test.c)
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 9897fa9ab9537..0b1488616c551 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -2,7 +2,7 @@
# Makefile for net selftests
CFLAGS = -Wall -Wl,--no-as-needed -O2 -g
-CFLAGS += -I../../../../usr/include/
+CFLAGS += -I../../../../usr/include/ $(KHDR_INCLUDES)
TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \
rtnetlink.sh xfrm_policy.sh test_blackhole_dev.sh
diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile
index 0356c4501c990..f905d5358e681 100644
--- a/tools/testing/selftests/net/mptcp/Makefile
+++ b/tools/testing/selftests/net/mptcp/Makefile
@@ -3,7 +3,7 @@
top_srcdir = ../../../../..
KSFT_KHDR_INSTALL := 1
-CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include
+CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES)
TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \
simult_flows.sh mptcp_sockopt.sh
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 2e7e86e852828..3b5faec3c04f4 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -2,6 +2,7 @@
hugepage-mmap
hugepage-mremap
hugepage-shm
+hugepage-vmemmap
khugepaged
map_hugetlb
map_populate
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 1607322a112c9..32032c7180205 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -23,7 +23,7 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/p
# LDLIBS.
MAKEFLAGS += --no-builtin-rules
-CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
+CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
LDLIBS = -lrt -lpthread
TEST_GEN_FILES = compaction_test
TEST_GEN_FILES += gup_test
@@ -31,6 +31,7 @@ TEST_GEN_FILES += hmm-tests
TEST_GEN_FILES += hugepage-mmap
TEST_GEN_FILES += hugepage-mremap
TEST_GEN_FILES += hugepage-shm
+TEST_GEN_FILES += hugepage-vmemmap
TEST_GEN_FILES += khugepaged
TEST_GEN_FILES += madv_populate
TEST_GEN_FILES += map_fixed_noreplace
@@ -142,7 +143,7 @@ $(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
$(OUTPUT)/gup_test: ../../../../mm/gup_test.h
-$(OUTPUT)/hmm-tests: local_config.h
+$(OUTPUT)/hmm-tests: local_config.h ../../../../mm/gup_test.h
# HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
$(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS)
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 203323967b507..11b83a8084fee 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -36,6 +36,7 @@
* in the usual include/uapi/... directory.
*/
#include "../../../../lib/test_hmm_uapi.h"
+#include "../../../../mm/gup_test.h"
struct hmm_buffer {
void *ptr;
@@ -44,6 +45,14 @@ struct hmm_buffer {
int fd;
uint64_t cpages;
uint64_t faults;
+ int zone_device_type;
+};
+
+enum {
+ HMM_PRIVATE_DEVICE_ONE,
+ HMM_PRIVATE_DEVICE_TWO,
+ HMM_COHERENCE_DEVICE_ONE,
+ HMM_COHERENCE_DEVICE_TWO,
};
#define TWOMEG (1 << 21)
@@ -52,6 +61,8 @@ struct hmm_buffer {
#define NTIMES 10
#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE 0x01 /* check pte is writable */
FIXTURE(hmm)
{
@@ -60,6 +71,21 @@ FIXTURE(hmm)
unsigned int page_shift;
};
+FIXTURE_VARIANT(hmm)
+{
+ int device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+ .device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+ .device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
FIXTURE(hmm2)
{
int fd0;
@@ -68,6 +94,24 @@ FIXTURE(hmm2)
unsigned int page_shift;
};
+FIXTURE_VARIANT(hmm2)
+{
+ int device_number0;
+ int device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+ .device_number0 = HMM_PRIVATE_DEVICE_ONE,
+ .device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+ .device_number0 = HMM_COHERENCE_DEVICE_ONE,
+ .device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
static int hmm_open(int unit)
{
char pathname[HMM_PATH_MAX];
@@ -81,12 +125,19 @@ static int hmm_open(int unit)
return fd;
}
+static bool hmm_is_coherent_type(int dev_num)
+{
+ return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
+}
+
FIXTURE_SETUP(hmm)
{
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
- self->fd = hmm_open(0);
+ self->fd = hmm_open(variant->device_number);
+ if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+ SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd, 0);
}
@@ -95,9 +146,11 @@ FIXTURE_SETUP(hmm2)
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
- self->fd0 = hmm_open(0);
+ self->fd0 = hmm_open(variant->device_number0);
+ if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+ SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd0, 0);
- self->fd1 = hmm_open(1);
+ self->fd1 = hmm_open(variant->device_number1);
ASSERT_GE(self->fd1, 0);
}
@@ -144,6 +197,7 @@ static int hmm_dmirror_cmd(int fd,
}
buffer->cpages = cmd.cpages;
buffer->faults = cmd.faults;
+ buffer->zone_device_type = cmd.zone_device_type;
return 0;
}
@@ -211,6 +265,20 @@ static void hmm_nanosleep(unsigned int n)
nanosleep(&t, NULL);
}
+static int hmm_migrate_sys_to_dev(int fd,
+ struct hmm_buffer *buffer,
+ unsigned long npages)
+{
+ return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
+}
+
+static int hmm_migrate_dev_to_sys(int fd,
+ struct hmm_buffer *buffer,
+ unsigned long npages)
+{
+ return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
+}
+
/*
* Simple NULL test of device open/close.
*/
@@ -875,7 +943,7 @@ TEST_F(hmm, migrate)
ptr[i] = i;
/* Migrate memory to device. */
- ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
@@ -923,7 +991,7 @@ TEST_F(hmm, migrate_fault)
ptr[i] = i;
/* Migrate memory to device. */
- ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
@@ -936,7 +1004,7 @@ TEST_F(hmm, migrate_fault)
ASSERT_EQ(ptr[i], i);
/* Migrate memory to the device again. */
- ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
@@ -976,7 +1044,7 @@ TEST_F(hmm, migrate_shared)
ASSERT_NE(buffer->ptr, MAP_FAILED);
/* Migrate memory to device. */
- ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, -ENOENT);
hmm_buffer_free(buffer);
@@ -1015,7 +1083,7 @@ TEST_F(hmm2, migrate_mixed)
p = buffer->ptr;
/* Migrating a protected area should be an error. */
- ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages);
+ ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
ASSERT_EQ(ret, -EINVAL);
/* Punch a hole after the first page address. */
@@ -1023,7 +1091,7 @@ TEST_F(hmm2, migrate_mixed)
ASSERT_EQ(ret, 0);
/* We expect an error if the vma doesn't cover the range. */
- ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 3);
+ ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3);
ASSERT_EQ(ret, -EINVAL);
/* Page 2 will be a read-only zero page. */
@@ -1055,13 +1123,13 @@ TEST_F(hmm2, migrate_mixed)
/* Now try to migrate pages 2-5 to device 1. */
buffer->ptr = p + 2 * self->page_size;
- ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 4);
+ ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, 4);
/* Page 5 won't be migrated to device 0 because it's on device 1. */
buffer->ptr = p + 5 * self->page_size;
- ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1);
+ ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
ASSERT_EQ(ret, -ENOENT);
buffer->ptr = p;
@@ -1070,8 +1138,12 @@ TEST_F(hmm2, migrate_mixed)
}
/*
- * Migrate anonymous memory to device private memory and fault it back to system
- * memory multiple times.
+ * Migrate anonymous memory to device memory and back to system memory
+ * multiple times. In case of private zone configuration, this is done
+ * through fault pages accessed by CPU. In case of coherent zone configuration,
+ * the pages from the device should be explicitly migrated back to system memory.
+ * The reason is Coherent device zone has coherent access by CPU, therefore
+ * it will not generate any page fault.
*/
TEST_F(hmm, migrate_multiple)
{
@@ -1107,8 +1179,7 @@ TEST_F(hmm, migrate_multiple)
ptr[i] = i;
/* Migrate memory to device. */
- ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer,
- npages);
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
@@ -1116,7 +1187,13 @@ TEST_F(hmm, migrate_multiple)
for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
ASSERT_EQ(ptr[i], i);
- /* Fault pages back to system memory and check them. */
+ /* Migrate back to system memory and check them. */
+ if (hmm_is_coherent_type(variant->device_number)) {
+ ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ }
+
for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
ASSERT_EQ(ptr[i], i);
@@ -1354,13 +1431,13 @@ TEST_F(hmm2, snapshot)
/* Page 5 will be migrated to device 0. */
buffer->ptr = p + 5 * self->page_size;
- ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1);
+ ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, 1);
/* Page 6 will be migrated to device 1. */
buffer->ptr = p + 6 * self->page_size;
- ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 1);
+ ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, 1);
@@ -1377,9 +1454,16 @@ TEST_F(hmm2, snapshot)
ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ);
ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ);
ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE);
- ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL |
- HMM_DMIRROR_PROT_WRITE);
- ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE);
+ if (!hmm_is_coherent_type(variant->device_number0)) {
+ ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL |
+ HMM_DMIRROR_PROT_WRITE);
+ ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE);
+ } else {
+ ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL |
+ HMM_DMIRROR_PROT_WRITE);
+ ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE |
+ HMM_DMIRROR_PROT_WRITE);
+ }
hmm_buffer_free(buffer);
}
@@ -1685,4 +1769,82 @@ TEST_F(hmm, exclusive_cow)
hmm_buffer_free(buffer);
}
+/*
+ * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
+ * This should trigger a migration back to system memory for both, private
+ * and coherent type pages.
+ * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added
+ * to your configuration before you run it.
+ */
+TEST_F(hmm, hmm_gup_test)
+{
+ struct hmm_buffer *buffer;
+ struct gup_test gup;
+ int gup_fd;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+ unsigned char *m;
+
+ gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+ if (gup_fd == -1)
+ SKIP(return, "Skipping test, could not find gup_test driver");
+
+ npages = 4;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Migrate memory to device. */
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ gup.nr_pages_per_call = npages;
+ gup.addr = (unsigned long)buffer->ptr;
+ gup.gup_flags = FOLL_WRITE;
+ gup.size = size;
+ /*
+ * Calling gup_test ioctl. It will try to PIN_LONGTERM these device pages
+ * causing a migration back to system memory for both, private and coherent
+ * type pages.
+ */
+ if (ioctl(gup_fd, PIN_LONGTERM_BENCHMARK, &gup)) {
+ perror("ioctl on PIN_LONGTERM_BENCHMARK\n");
+ goto out_test;
+ }
+
+ /* Take snapshot to make sure pages have been migrated to sys memory */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ m = buffer->mirror;
+ for (i = 0; i < npages; i++)
+ ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE);
+out_test:
+ close(gup_fd);
+ hmm_buffer_free(buffer);
+}
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c
index 2a7c33631a298..1d689084a54ba 100644
--- a/tools/testing/selftests/vm/hugepage-mremap.c
+++ b/tools/testing/selftests/vm/hugepage-mremap.c
@@ -3,9 +3,10 @@
* hugepage-mremap:
*
* Example of remapping huge page memory in a user application using the
- * mremap system call. Code assumes a hugetlbfs filesystem is mounted
- * at './huge'. The amount of memory used by this test is decided by a command
- * line argument in MBs. If missing, the default amount is 10MB.
+ * mremap system call. The path to a file in a hugetlbfs filesystem must
+ * be passed as the last argument to this test. The amount of memory used
+ * by this test in MBs can optionally be passed as an argument. If no memory
+ * amount is passed, the default amount is 10MB.
*
* To make sure the test triggers pmd sharing and goes through the 'unshare'
* path in the mremap code use 1GB (1024) or more.
@@ -25,7 +26,6 @@
#define DEFAULT_LENGTH_MB 10UL
#define MB_TO_BYTES(x) (x * 1024 * 1024)
-#define FILE_NAME "huge/hugepagefile"
#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
#define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
@@ -107,17 +107,26 @@ static void register_region_with_uffd(char *addr, size_t len)
int main(int argc, char *argv[])
{
+ size_t length;
+
+ if (argc != 2 && argc != 3) {
+ printf("Usage: %s [length_in_MB] <hugetlb_file>\n", argv[0]);
+ exit(1);
+ }
+
/* Read memory length as the first arg if valid, otherwise fallback to
- * the default length. Any additional args are ignored.
+ * the default length.
*/
- size_t length = argc > 1 ? (size_t)atoi(argv[1]) : 0UL;
+ if (argc == 3)
+ length = argc > 2 ? (size_t)atoi(argv[1]) : 0UL;
length = length > 0 ? length : DEFAULT_LENGTH_MB;
length = MB_TO_BYTES(length);
int ret = 0;
- int fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
+ /* last arg is the hugetlb file name */
+ int fd = open(argv[argc-1], O_CREAT | O_RDWR, 0755);
if (fd < 0) {
perror("Open failed");
@@ -169,5 +178,8 @@ int main(int argc, char *argv[])
munmap(addr, length);
+ close(fd);
+ unlink(argv[argc-1]);
+
return ret;
}
diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c
new file mode 100644
index 0000000000000..557bdbd4f87e8
--- /dev/null
+++ b/tools/testing/selftests/vm/hugepage-vmemmap.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test case of using hugepage memory in a user application using the
+ * mmap system call with MAP_HUGETLB flag. Before running this program
+ * make sure the administrator has allocated enough default sized huge
+ * pages to cover the 2 MB allocation.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define MAP_LENGTH (2UL * 1024 * 1024)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000 /* arch specific */
+#endif
+
+#define PAGE_SIZE 4096
+
+#define PAGE_COMPOUND_HEAD (1UL << 15)
+#define PAGE_COMPOUND_TAIL (1UL << 16)
+#define PAGE_HUGE (1UL << 17)
+
+#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE)
+#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE)
+
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1)
+
+/*
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified. Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#ifdef __ia64__
+#define MAP_ADDR (void *)(0x8000000000000000UL)
+#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define MAP_ADDR NULL
+#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+static void write_bytes(char *addr, size_t length)
+{
+ unsigned long i;
+
+ for (i = 0; i < length; i++)
+ *(addr + i) = (char)i;
+}
+
+static unsigned long virt_to_pfn(void *addr)
+{
+ int fd;
+ unsigned long pagemap;
+
+ fd = open("/proc/self/pagemap", O_RDONLY);
+ if (fd < 0)
+ return -1UL;
+
+ lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET);
+ read(fd, &pagemap, sizeof(pagemap));
+ close(fd);
+
+ return pagemap & ~PM_PFRAME_MASK;
+}
+
+static int check_page_flags(unsigned long pfn)
+{
+ int fd, i;
+ unsigned long pageflags;
+
+ fd = open("/proc/kpageflags", O_RDONLY);
+ if (fd < 0)
+ return -1;
+
+ lseek(fd, pfn * sizeof(pageflags), SEEK_SET);
+
+ read(fd, &pageflags, sizeof(pageflags));
+ if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) {
+ close(fd);
+ printf("Head page flags (%lx) is invalid\n", pageflags);
+ return -1;
+ }
+
+ /*
+ * pages other than the first page must be tail and shouldn't be head;
+ * this also verifies kernel has correctly set the fake page_head to tail
+ * while hugetlb_free_vmemmap is enabled.
+ */
+ for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) {
+ read(fd, &pageflags, sizeof(pageflags));
+ if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS ||
+ (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) {
+ close(fd);
+ printf("Tail page flags (%lx) is invalid\n", pageflags);
+ return -1;
+ }
+ }
+
+ close(fd);
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ void *addr;
+ unsigned long pfn;
+
+ addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ /* Trigger allocation of HugeTLB page. */
+ write_bytes(addr, MAP_LENGTH);
+
+ pfn = virt_to_pfn(addr);
+ if (pfn == -1UL) {
+ munmap(addr, MAP_LENGTH);
+ perror("virt_to_pfn");
+ exit(1);
+ }
+
+ printf("Returned address is %p whose pfn is %lx\n", addr, pfn);
+
+ if (check_page_flags(pfn) < 0) {
+ munmap(addr, MAP_LENGTH);
+ perror("check_page_flags");
+ exit(1);
+ }
+
+ /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+ if (munmap(addr, MAP_LENGTH)) {
+ perror("munmap");
+ exit(1);
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c
index 93e7e7ffed337..957b9e18c7295 100644
--- a/tools/testing/selftests/vm/memfd_secret.c
+++ b/tools/testing/selftests/vm/memfd_secret.c
@@ -282,7 +282,7 @@ int main(int argc, char *argv[])
close(fd);
- ksft_exit(!ksft_get_fail_cnt());
+ ksft_finished();
}
#else /* __NR_memfd_secret */
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index 75d4017413944..e10d50e0b8e83 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -111,7 +111,19 @@ fi
echo "-----------------------"
echo "running hugepage-mremap"
echo "-----------------------"
-./hugepage-mremap 256
+./hugepage-mremap $mnt/huge_mremap
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+rm -f $mnt/huge_mremap
+
+echo "------------------------"
+echo "running hugepage-vmemmap"
+echo "------------------------"
+./hugepage-vmemmap
if [ $? -ne 0 ]; then
echo "[FAIL]"
exitcode=1
diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh
index 0647b525a6256..539c9371e592a 100755
--- a/tools/testing/selftests/vm/test_hmm.sh
+++ b/tools/testing/selftests/vm/test_hmm.sh
@@ -40,11 +40,26 @@ check_test_requirements()
load_driver()
{
- modprobe $DRIVER > /dev/null 2>&1
+ if [ $# -eq 0 ]; then
+ modprobe $DRIVER > /dev/null 2>&1
+ else
+ if [ $# -eq 2 ]; then
+ modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2
+ > /dev/null 2>&1
+ else
+ echo "Missing module parameters. Make sure pass"\
+ "spm_addr_dev0 and spm_addr_dev1"
+ usage
+ fi
+ fi
if [ $? == 0 ]; then
major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices)
mknod /dev/hmm_dmirror0 c $major 0
mknod /dev/hmm_dmirror1 c $major 1
+ if [ $# -eq 2 ]; then
+ mknod /dev/hmm_dmirror2 c $major 2
+ mknod /dev/hmm_dmirror3 c $major 3
+ fi
fi
}
@@ -58,7 +73,7 @@ run_smoke()
{
echo "Running smoke test. Note, this test provides basic coverage."
- load_driver
+ load_driver $1 $2
$(dirname "${BASH_SOURCE[0]}")/hmm-tests
unload_driver
}
@@ -75,6 +90,9 @@ usage()
echo "# Smoke testing"
echo "./${TEST_NAME}.sh smoke"
echo
+ echo "# Smoke testing with SPM enabled"
+ echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>"
+ echo
exit 0
}
@@ -84,7 +102,7 @@ function run_test()
usage
else
if [ "$1" = "smoke" ]; then
- run_smoke
+ run_smoke $2 $3
else
usage
fi
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 2f49c9af1b582..50476ec6c0704 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -119,6 +119,9 @@ struct uffd_stats {
~(unsigned long)(sizeof(unsigned long long) \
- 1)))
+#define swap(a, b) \
+ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
const char *examples =
"# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
"./userfaultfd anon 100 99999\n\n"
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index 9ebb84a9c7310..c8ec2d6b314de 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -20,22 +20,27 @@
#include <string.h>
#include <regex.h>
#include <errno.h>
+#include <linux/types.h>
struct block_list {
char *txt;
+ char *stacktrace;
int len;
int num;
int page_num;
+ pid_t pid;
+ __u64 ts_nsec;
+ __u64 free_ts_nsec;
};
-static int sort_by_memory;
static regex_t order_pattern;
+static regex_t pid_pattern;
+static regex_t ts_nsec_pattern;
+static regex_t free_ts_nsec_pattern;
static struct block_list *list;
static int list_size;
static int max_size;
-struct block_list *block_head;
-
int read_block(char *buf, int buf_size, FILE *fin)
{
char *curr = buf, *const buf_end = buf + buf_size;
@@ -58,6 +63,13 @@ static int compare_txt(const void *p1, const void *p2)
return strcmp(l1->txt, l2->txt);
}
+static int compare_stacktrace(const void *p1, const void *p2)
+{
+ const struct block_list *l1 = p1, *l2 = p2;
+
+ return strcmp(l1->stacktrace, l2->stacktrace);
+}
+
static int compare_num(const void *p1, const void *p2)
{
const struct block_list *l1 = p1, *l2 = p2;
@@ -72,34 +84,124 @@ static int compare_page_num(const void *p1, const void *p2)
return l2->page_num - l1->page_num;
}
-static int get_page_num(char *buf)
+static int compare_pid(const void *p1, const void *p2)
{
- int err, val_len, order_val;
- char order_str[4] = {0};
- char *endptr;
+ const struct block_list *l1 = p1, *l2 = p2;
+
+ return l1->pid - l2->pid;
+}
+
+static int compare_ts(const void *p1, const void *p2)
+{
+ const struct block_list *l1 = p1, *l2 = p2;
+
+ return l1->ts_nsec < l2->ts_nsec ? -1 : 1;
+}
+
+static int compare_free_ts(const void *p1, const void *p2)
+{
+ const struct block_list *l1 = p1, *l2 = p2;
+
+ return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1;
+}
+
+static int search_pattern(regex_t *pattern, char *pattern_str, char *buf)
+{
+ int err, val_len;
regmatch_t pmatch[2];
- err = regexec(&order_pattern, buf, 2, pmatch, REG_NOTBOL);
+ err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL);
if (err != 0 || pmatch[1].rm_so == -1) {
- printf("no order pattern in %s\n", buf);
- return 0;
+ printf("no matching pattern in %s\n", buf);
+ return -1;
}
val_len = pmatch[1].rm_eo - pmatch[1].rm_so;
- if (val_len > 2) /* max_order should not exceed 2 digits */
- goto wrong_order;
- memcpy(order_str, buf + pmatch[1].rm_so, val_len);
+ memcpy(pattern_str, buf + pmatch[1].rm_so, val_len);
+
+ return 0;
+}
+static void check_regcomp(regex_t *pattern, const char *regex)
+{
+ int err;
+
+ err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE);
+ if (err != 0 || pattern->re_nsub != 1) {
+ printf("Invalid pattern %s code %d\n", regex, err);
+ exit(1);
+ }
+}
+
+# define FIELD_BUFF 25
+
+static int get_page_num(char *buf)
+{
+ int order_val;
+ char order_str[FIELD_BUFF] = {0};
+ char *endptr;
+
+ search_pattern(&order_pattern, order_str, buf);
errno = 0;
order_val = strtol(order_str, &endptr, 10);
- if (errno != 0 || endptr == order_str || *endptr != '\0')
- goto wrong_order;
+ if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') {
+ printf("wrong order in follow buf:\n%s\n", buf);
+ return 0;
+ }
return 1 << order_val;
+}
+
+static pid_t get_pid(char *buf)
+{
+ pid_t pid;
+ char pid_str[FIELD_BUFF] = {0};
+ char *endptr;
+
+ search_pattern(&pid_pattern, pid_str, buf);
+ errno = 0;
+ pid = strtol(pid_str, &endptr, 10);
+ if (errno != 0 || endptr == pid_str || *endptr != '\0') {
+ printf("wrong/invalid pid in follow buf:\n%s\n", buf);
+ return -1;
+ }
+
+ return pid;
-wrong_order:
- printf("wrong order in follow buf:\n%s\n", buf);
- return 0;
+}
+
+static __u64 get_ts_nsec(char *buf)
+{
+ __u64 ts_nsec;
+ char ts_nsec_str[FIELD_BUFF] = {0};
+ char *endptr;
+
+ search_pattern(&ts_nsec_pattern, ts_nsec_str, buf);
+ errno = 0;
+ ts_nsec = strtoull(ts_nsec_str, &endptr, 10);
+ if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') {
+ printf("wrong ts_nsec in follow buf:\n%s\n", buf);
+ return -1;
+ }
+
+ return ts_nsec;
+}
+
+static __u64 get_free_ts_nsec(char *buf)
+{
+ __u64 free_ts_nsec;
+ char free_ts_nsec_str[FIELD_BUFF] = {0};
+ char *endptr;
+
+ search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf);
+ errno = 0;
+ free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10);
+ if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') {
+ printf("wrong free_ts_nsec in follow buf:\n%s\n", buf);
+ return -1;
+ }
+
+ return free_ts_nsec;
}
static void add_list(char *buf, int len)
@@ -121,6 +223,10 @@ static void add_list(char *buf, int len)
list[list_size].page_num = get_page_num(buf);
memcpy(list[list_size].txt, buf, len);
list[list_size].txt[len] = 0;
+ list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: "";
+ list[list_size].pid = get_pid(buf);
+ list[list_size].ts_nsec = get_ts_nsec(buf);
+ list[list_size].free_ts_nsec = get_free_ts_nsec(buf);
list_size++;
if (list_size % 1000 == 0) {
printf("loaded %d\r", list_size);
@@ -132,25 +238,55 @@ static void add_list(char *buf, int len)
static void usage(void)
{
- printf("Usage: ./page_owner_sort [-m] <input> <output>\n"
- "-m Sort by total memory. If this option is unset, sort by times\n"
+ printf("Usage: ./page_owner_sort [OPTIONS] <input> <output>\n"
+ "-m Sort by total memory.\n"
+ "-s Sort by the stack trace.\n"
+ "-t Sort by times (default).\n"
+ "-p Sort by pid.\n"
+ "-a Sort by memory allocate time.\n"
+ "-r Sort by memory release time.\n"
+ "-c Cull by comparing stacktrace instead of total block.\n"
+ "-f Filter out the information of blocks whose memory has not been released.\n"
);
}
int main(int argc, char **argv)
{
+ int (*cmp)(const void *, const void *) = compare_num;
+ int cull_st = 0;
+ int filter = 0;
FILE *fin, *fout;
char *buf;
int ret, i, count;
struct block_list *list2;
struct stat st;
- int err;
int opt;
- while ((opt = getopt(argc, argv, "m")) != -1)
+ while ((opt = getopt(argc, argv, "acfmprst")) != -1)
switch (opt) {
+ case 'a':
+ cmp = compare_ts;
+ break;
+ case 'c':
+ cull_st = 1;
+ break;
+ case 'f':
+ filter = 1;
+ break;
case 'm':
- sort_by_memory = 1;
+ cmp = compare_page_num;
+ break;
+ case 'p':
+ cmp = compare_pid;
+ break;
+ case 'r':
+ cmp = compare_free_ts;
+ break;
+ case 's':
+ cmp = compare_stacktrace;
+ break;
+ case 't':
+ cmp = compare_num;
break;
default:
usage();
@@ -170,13 +306,10 @@ int main(int argc, char **argv)
exit(1);
}
- err = regcomp(&order_pattern, "order\\s*([0-9]*),", REG_EXTENDED|REG_NEWLINE);
- if (err != 0 || order_pattern.re_nsub != 1) {
- printf("%s: Invalid pattern 'order\\s*([0-9]*),' code %d\n",
- argv[0], err);
- exit(1);
- }
-
+ check_regcomp(&order_pattern, "order\\s*([0-9]*),");
+ check_regcomp(&pid_pattern, "pid\\s*([0-9]*),");
+ check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,");
+ check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns");
fstat(fileno(fin), &st);
max_size = st.st_size / 100; /* hack ... */
@@ -199,7 +332,10 @@ int main(int argc, char **argv)
printf("sorting ....\n");
- qsort(list, list_size, sizeof(list[0]), compare_txt);
+ if (cull_st == 1)
+ qsort(list, list_size, sizeof(list[0]), compare_stacktrace);
+ else
+ qsort(list, list_size, sizeof(list[0]), compare_txt);
list2 = malloc(sizeof(*list) * list_size);
if (!list2) {
@@ -209,9 +345,11 @@ int main(int argc, char **argv)
printf("culling\n");
+ long offset = cull_st ? &list[0].stacktrace - &list[0].txt : 0;
+
for (i = count = 0; i < list_size; i++) {
if (count == 0 ||
- strcmp(list2[count-1].txt, list[i].txt) != 0) {
+ strcmp(*(&list2[count-1].txt+offset), *(&list[i].txt+offset)) != 0) {
list2[count++] = list[i];
} else {
list2[count-1].num += list[i].num;
@@ -219,15 +357,17 @@ int main(int argc, char **argv)
}
}
- if (sort_by_memory)
- qsort(list2, count, sizeof(list[0]), compare_page_num);
- else
- qsort(list2, count, sizeof(list[0]), compare_num);
+ qsort(list2, count, sizeof(list[0]), cmp);
- for (i = 0; i < count; i++)
+ for (i = 0; i < count; i++) {
+ if (filter == 1 && list2[i].free_ts_nsec != 0)
+ continue;
fprintf(fout, "%d times, %d pages:\n%s\n",
list2[i].num, list2[i].page_num, list2[i].txt);
-
+ }
regfree(&order_pattern);
+ regfree(&pid_pattern);
+ regfree(&ts_nsec_pattern);
+ regfree(&free_ts_nsec_pattern);
return 0;
}