--- 6.0-cachyos-base-all.patch.old 2022-10-03 10:53:04.991120773 +0300 | |
+++ 6.0-cachyos-base-all.patch 2022-10-03 09:57:46.659670708 +0300 | |
@@ -1,7 +1,7 @@ | |
-From 4ee5774d519ab3d21a214f4aa94e3f2ddc6ceb81 Mon Sep 17 00:00:00 2001 | |
+From 2fa4f73d2e50a4a2c2c2873f08ac131c10717317 Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
-Date: Tue, 27 Sep 2022 15:12:20 +0200 | |
-Subject: [PATCH 01/16] cachy | |
+Date: Sun, 2 Oct 2022 23:51:09 +0200 | |
+Subject: [PATCH 01/17] cachy | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -71,11 +71,12 @@ | |
include/linux/user_namespace.h | 4 + | |
include/linux/wait.h | 2 + | |
include/uapi/linux/if_bonding.h | 2 +- | |
- init/Kconfig | 26 + | |
+ init/Kconfig | 39 + | |
init/do_mounts.c | 16 +- | |
kernel/Kconfig.hz | 24 + | |
kernel/fork.c | 14 + | |
kernel/locking/rwsem.c | 4 +- | |
+ kernel/module/Kconfig | 25 + | |
kernel/module/internal.h | 2 + | |
kernel/module/main.c | 1 + | |
kernel/module/procfs.c | 13 + | |
@@ -91,8 +92,8 @@ | |
lib/raid6/algos.c | 4 +- | |
lib/string.c | 62 +- | |
lib/zstd/Makefile | 16 +- | |
- lib/zstd/common/entropy_common.c | 4 +- | |
- lib/zstd/common/zstd_common.c | 7 + | |
+ lib/zstd/common/entropy_common.c | 5 +- | |
+ lib/zstd/common/zstd_common.c | 10 + | |
lib/zstd/compress/zstd_double_fast.c | 61 +- | |
lib/zstd/compress/zstd_fast.c | 69 +- | |
lib/zstd/compress/zstd_lazy.c | 223 ++--- | |
@@ -106,7 +107,9 @@ | |
mm/vmscan.c | 4 + | |
net/ipv4/inet_connection_sock.c | 2 +- | |
net/ipv4/tcp.c | 4 +- | |
- 101 files changed, 2400 insertions(+), 349 deletions(-) | |
+ scripts/Makefile.lib | 13 +- | |
+ scripts/Makefile.modinst | 7 +- | |
+ 104 files changed, 2458 insertions(+), 353 deletions(-) | |
create mode 100644 arch/x86/Makefile.postlink | |
diff --git a/.gitignore b/.gitignore | |
@@ -152,7 +155,7 @@ | |
``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. | |
diff --git a/Makefile b/Makefile | |
-index 647a42a1f800..5c327c29ef12 100644 | |
+index 8478e13e9424..30320363622c 100644 | |
--- a/Makefile | |
+++ b/Makefile | |
@@ -758,6 +758,8 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) | |
@@ -1011,7 +1014,7 @@ | |
#define MODULE_PROC_FAMILY "ELAN " | |
#elif defined CONFIG_MCRUSOE | |
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c | |
-index 62f6b8b7c4a5..f9c9b5850847 100644 | |
+index 4f3204364caa..097a6cfad8b4 100644 | |
--- a/arch/x86/kernel/alternative.c | |
+++ b/arch/x86/kernel/alternative.c | |
@@ -936,7 +936,9 @@ void __init alternative_instructions(void) | |
@@ -1304,7 +1307,7 @@ | |
#endif /* CONFIG_BFQ_CGROUP_DEBUG */ | |
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c | |
-index c740b41fe0a4..5ea6245f0208 100644 | |
+index c740b41fe0a4..adf6cd94fd4a 100644 | |
--- a/block/bfq-iosched.c | |
+++ b/block/bfq-iosched.c | |
@@ -1925,7 +1925,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, | |
@@ -1367,7 +1370,7 @@ | |
static int __init bfq_init(void) | |
{ | |
int ret; | |
-+ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v5.19"; | |
++ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.0.0"; | |
#ifdef CONFIG_BFQ_GROUP_IOSCHED | |
ret = blkcg_policy_register(&blkcg_policy_bfq); | |
@@ -3334,7 +3337,7 @@ | |
/* fake multicast ability */ | |
static void set_multicast_list(struct net_device *dev) | |
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c | |
-index 66446f1e06cf..c65b03f91ecf 100644 | |
+index 8d5a7ae19844..56d1780d1337 100644 | |
--- a/drivers/nvme/host/core.c | |
+++ b/drivers/nvme/host/core.c | |
@@ -58,7 +58,7 @@ static u8 nvme_max_retries = 5; | |
@@ -3606,7 +3609,7 @@ | |
#define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */ | |
diff --git a/init/Kconfig b/init/Kconfig | |
-index 532362fcfe31..442a945ca6ae 100644 | |
+index 532362fcfe31..f5bd72b39352 100644 | |
--- a/init/Kconfig | |
+++ b/init/Kconfig | |
@@ -112,6 +112,10 @@ config THREAD_INFO_IN_TASK | |
@@ -3620,7 +3623,27 @@ | |
config BROKEN | |
bool | |
-@@ -1241,6 +1245,22 @@ config USER_NS | |
+@@ -334,6 +338,19 @@ config KERNEL_UNCOMPRESSED | |
+ | |
+ endchoice | |
+ | |
++menu "ZSTD compression options" | |
++ depends on KERNEL_ZSTD | |
++ | |
++config ZSTD_COMP_VAL | |
++ int "Compression level (1-22)" | |
++ range 1 22 | |
++ default "22" | |
++ help | |
++ Choose a compression level for zstd kernel compression. | |
++ Default is 22, which is the maximum. | |
++ | |
++endmenu | |
++ | |
+ config DEFAULT_INIT | |
+ string "Default init path" | |
+ default "" | |
+@@ -1241,6 +1258,22 @@ config USER_NS | |
If unsure, say N. | |
@@ -3643,7 +3666,7 @@ | |
config PID_NS | |
bool "PID Namespaces" | |
default y | |
-@@ -1407,6 +1427,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE | |
+@@ -1407,6 +1440,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE | |
with the "-O2" compiler flag for best performance and most | |
helpful compile-time warnings. | |
@@ -3794,6 +3817,42 @@ | |
} | |
return state; | |
+diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig | |
+index 26ea5d04f56c..e5311101b93d 100644 | |
+--- a/kernel/module/Kconfig | |
++++ b/kernel/module/Kconfig | |
+@@ -219,6 +219,31 @@ config MODULE_COMPRESS_ZSTD | |
+ | |
+ endchoice | |
+ | |
++menu "ZSTD module compression options" | |
++ depends on MODULE_COMPRESS_ZSTD | |
++ | |
++config MODULE_COMPRESS_ZSTD_LEVEL | |
++ int "Compression level (1-19)" | |
++ range 1 19 | |
++ default 9 | |
++ help | |
++ Compression level used by zstd for compressing modules. | |
++ | |
++config MODULE_COMPRESS_ZSTD_ULTRA | |
++ bool "Enable ZSTD ultra compression" | |
++ help | |
++ Compress modules with ZSTD using the highest possible compression. | |
++ | |
++config MODULE_COMPRESS_ZSTD_LEVEL_ULTRA | |
++ int "Compression level (20-22)" | |
++ depends on MODULE_COMPRESS_ZSTD_ULTRA | |
++ range 20 22 | |
++ default 20 | |
++ help | |
++ Ultra compression level used by zstd for compressing modules. | |
++ | |
++endmenu | |
++ | |
+ config MODULE_DECOMPRESS | |
+ bool "Support in-kernel module decompression" | |
+ depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ | |
diff --git a/kernel/module/internal.h b/kernel/module/internal.h | |
index 680d980a4fb2..8a3abfff9fe9 100644 | |
--- a/kernel/module/internal.h | |
@@ -4255,7 +4314,7 @@ | |
- decompress/zstd_decompress.o \ | |
- decompress/zstd_decompress_block.o \ | |
diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c | |
-index 53b47a2b52ff..f84612627471 100644 | |
+index 53b47a2b52ff..a311808c0d56 100644 | |
--- a/lib/zstd/common/entropy_common.c | |
+++ b/lib/zstd/common/entropy_common.c | |
@@ -15,6 +15,7 @@ | |
@@ -4283,8 +4342,13 @@ | |
FORCE_INLINE_TEMPLATE size_t | |
HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, | |
+@@ -355,3 +357,4 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, | |
+ (void)bmi2; | |
+ return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); | |
+ } | |
++EXPORT_SYMBOL_GPL(HUF_readStats_wksp); | |
diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c | |
-index 3d7e35b309b5..06f62b2026d5 100644 | |
+index 3d7e35b309b5..0f1f63be25d9 100644 | |
--- a/lib/zstd/common/zstd_common.c | |
+++ b/lib/zstd/common/zstd_common.c | |
@@ -13,6 +13,7 @@ | |
@@ -4295,7 +4359,25 @@ | |
#define ZSTD_DEPS_NEED_MALLOC | |
#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ | |
#include "error_private.h" | |
-@@ -59,6 +60,7 @@ void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) | |
+@@ -35,14 +36,17 @@ const char* ZSTD_versionString(void) { return ZSTD_VERSION_STRING; } | |
+ * tells if a return value is an error code | |
+ * symbol is required for external callers */ | |
+ unsigned ZSTD_isError(size_t code) { return ERR_isError(code); } | |
++EXPORT_SYMBOL_GPL(ZSTD_isError); | |
+ | |
+ /*! ZSTD_getErrorName() : | |
+ * provides error code string from function result (useful for debugging) */ | |
+ const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); } | |
++EXPORT_SYMBOL_GPL(ZSTD_getErrorName); | |
+ | |
+ /*! ZSTD_getError() : | |
+ * convert a `size_t` function result into a proper ZSTD_errorCode enum */ | |
+ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } | |
++EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); | |
+ | |
+ /*! ZSTD_getErrorString() : | |
+ * provides error code string from enum */ | |
+@@ -59,6 +63,7 @@ void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) | |
return customMem.customAlloc(customMem.opaque, size); | |
return ZSTD_malloc(size); | |
} | |
@@ -4303,7 +4385,7 @@ | |
void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) | |
{ | |
-@@ -71,6 +73,7 @@ void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) | |
+@@ -71,6 +76,7 @@ void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) | |
} | |
return ZSTD_calloc(1, size); | |
} | |
@@ -4311,7 +4393,7 @@ | |
void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) | |
{ | |
-@@ -81,3 +84,7 @@ void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) | |
+@@ -81,3 +87,7 @@ void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) | |
ZSTD_free(ptr); | |
} | |
} | |
@@ -4935,10 +5017,10 @@ | |
EXPORT_SYMBOL_GPL(dirty_writeback_interval); | |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c | |
-index e5486d47406e..cf131d6e08fb 100644 | |
+index d04211f0ef0b..cc6179d3a7dc 100644 | |
--- a/mm/page_alloc.c | |
+++ b/mm/page_alloc.c | |
-@@ -6982,11 +6982,11 @@ static int zone_batchsize(struct zone *zone) | |
+@@ -7027,11 +7027,11 @@ static int zone_batchsize(struct zone *zone) | |
/* | |
* The number of pages to batch allocate is either ~0.1% | |
@@ -4952,7 +5034,7 @@ | |
batch /= 4; /* We effectively *= 4 below */ | |
if (batch < 1) | |
batch = 1; | |
-@@ -7064,6 +7064,7 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) | |
+@@ -7109,6 +7109,7 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) | |
* historical relationship between high and batch. | |
*/ | |
high = max(high, batch << 2); | |
@@ -4998,7 +5080,7 @@ | |
/* | |
diff --git a/mm/vmscan.c b/mm/vmscan.c | |
-index b2b1431352dc..0fc65ace3a4e 100644 | |
+index 382dbe97329f..fbc8c8f4fe60 100644 | |
--- a/mm/vmscan.c | |
+++ b/mm/vmscan.c | |
@@ -178,7 +178,11 @@ struct scan_control { | |
@@ -5041,13 +5123,61 @@ | |
init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; | |
init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; | |
+diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib | |
+index 3fb6a99e78c4..f62770a0a84f 100644 | |
+--- a/scripts/Makefile.lib | |
++++ b/scripts/Makefile.lib | |
+@@ -504,14 +504,21 @@ quiet_cmd_xzmisc = XZMISC $@ | |
+ # decompression is used, like initramfs decompression, zstd22 should likely not | |
+ # be used because it would require zstd to allocate a 128 MB buffer. | |
+ | |
++ifdef CONFIG_ZSTD_COMP_VAL | |
++zstd_comp_val := $(CONFIG_ZSTD_COMP_VAL) | |
++ifeq ($(shell test $(zstd_comp_val) -gt 19; echo $$?),0) | |
++zstd_comp_val += --ultra | |
++endif | |
++endif | |
++ | |
+ quiet_cmd_zstd = ZSTD $@ | |
+- cmd_zstd = cat $(real-prereqs) | $(ZSTD) -19 > $@ | |
++ cmd_zstd = cat $(real-prereqs) | $(ZSTD) -T0 -19 > $@ | |
+ | |
+ quiet_cmd_zstd22 = ZSTD22 $@ | |
+- cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -22 --ultra > $@ | |
++ cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -T0 -22 --ultra > $@ | |
+ | |
+ quiet_cmd_zstd22_with_size = ZSTD22 $@ | |
+- cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@ | |
++ cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -T0 -$(zstd_comp_val); $(size_append); } > $@ | |
+ | |
+ # ASM offsets | |
+ # --------------------------------------------------------------------------- | |
+diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst | |
+index a4c987c23750..132863cf3183 100644 | |
+--- a/scripts/Makefile.modinst | |
++++ b/scripts/Makefile.modinst | |
+@@ -96,8 +96,13 @@ quiet_cmd_gzip = GZIP $@ | |
+ cmd_gzip = $(KGZIP) -n -f $< | |
+ quiet_cmd_xz = XZ $@ | |
+ cmd_xz = $(XZ) --lzma2=dict=2MiB -f $< | |
++ifdef CONFIG_MODULE_COMPRESS_ZSTD_ULTRA | |
+ quiet_cmd_zstd = ZSTD $@ | |
+- cmd_zstd = $(ZSTD) -T0 --rm -f -q $< | |
++ cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL_ULTRA) --ultra --zstd=wlog=21 -T0 --rm -f -q $< | |
++else | |
++quiet_cmd_zstd = ZSTD $@ | |
++ cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL) --zstd=wlog=21 -T0 --rm -f -q $< | |
++endif | |
+ | |
+ $(dst)/%.ko.gz: $(dst)/%.ko FORCE | |
+ $(call cmd,gzip) | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From 0feaada45827f920b03a53edea1d34597614db84 Mon Sep 17 00:00:00 2001 | |
+From 141640e23fd2ab7f136bf64267472cc06f74e7e5 Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Mon, 5 Sep 2022 08:34:43 +0200 | |
-Subject: [PATCH 02/16] bbr2 | |
+Subject: [PATCH 02/17] bbr2 | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -8714,12 +8844,12 @@ | |
event = icsk->icsk_pending; | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From 3a2a43e0dc41577b2d9262692c628362129d539d Mon Sep 17 00:00:00 2001 | |
+From a4b23da78754ee7604440d04fc79b263c397cb5c Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Sun, 25 Sep 2022 23:49:46 +0200 | |
-Subject: [PATCH 03/16] futex-winesync | |
+Subject: [PATCH 03/17] futex-winesync | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -9236,10 +9366,10 @@ | |
+ ``objs`` and in ``alert`` If this is attempted, the function fails | |
+ with ``EINVAL``. | |
diff --git a/MAINTAINERS b/MAINTAINERS | |
-index f5ca4aefd184..31a7aa60cdc3 100644 | |
+index 72b9654f764c..ff31beb17835 100644 | |
--- a/MAINTAINERS | |
+++ b/MAINTAINERS | |
-@@ -21921,6 +21921,15 @@ M: David Härdeman <[email protected]> | |
+@@ -21920,6 +21920,15 @@ M: David Härdeman <[email protected]> | |
S: Maintained | |
F: drivers/media/rc/winbond-cir.c | |
@@ -12116,12 +12246,12 @@ | |
+ | |
+TEST_HARNESS_MAIN | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From 0905ce4d17bc19b8ec54ef87ed8f42e365a2bcc2 Mon Sep 17 00:00:00 2001 | |
+From b09871d4f5597879fd54097962968b4a35785967 Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Fri, 5 Aug 2022 19:33:47 +0200 | |
-Subject: [PATCH 04/16] Introducing-OpenVPN-Data-Channel-Offload | |
+Subject: [PATCH 04/17] Introducing-OpenVPN-Data-Channel-Offload | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -12195,10 +12325,10 @@ | |
create mode 100644 include/uapi/linux/ovpn_dco.h | |
diff --git a/MAINTAINERS b/MAINTAINERS | |
-index 31a7aa60cdc3..a29c9731350c 100644 | |
+index ff31beb17835..594e31ec15cb 100644 | |
--- a/MAINTAINERS | |
+++ b/MAINTAINERS | |
-@@ -15320,6 +15320,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git | |
+@@ -15319,6 +15319,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git | |
F: Documentation/filesystems/overlayfs.rst | |
F: fs/overlayfs/ | |
@@ -18283,12 +18413,12 @@ | |
#endif /* _UAPI_LINUX_UDP_H */ | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From 14903eee0b5577711272732705260cb83e5e0777 Mon Sep 17 00:00:00 2001 | |
+From 25b27cf5b605ab3b63df5a163037e6c8beadb5ca Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Wed, 28 Sep 2022 00:26:01 +0200 | |
-Subject: [PATCH 05/16] mm/demotion: Memory tiers and demotion | |
+Subject: [PATCH 05/17] mm/demotion: Memory tiers and demotion | |
The current kernel has the basic memory tiering support: Inactive pages on | |
a higher tier NUMA node can be migrated (demoted) to a lower tier NUMA | |
@@ -18791,7 +18921,7 @@ | |
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o | |
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o | |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c | |
-index e9414ee57c5b..6eb4b1799b79 100644 | |
+index f42bb51e023a..9efa67e45534 100644 | |
--- a/mm/huge_memory.c | |
+++ b/mm/huge_memory.c | |
@@ -36,6 +36,7 @@ | |
@@ -19541,7 +19671,7 @@ | |
+#endif /* CONFIG_SYSFS */ | |
+#endif | |
diff --git a/mm/memory.c b/mm/memory.c | |
-index 4ba73f5aa8bb..3a3d8721bf4c 100644 | |
+index a78814413ac0..7032db10622b 100644 | |
--- a/mm/memory.c | |
+++ b/mm/memory.c | |
@@ -66,6 +66,7 @@ | |
@@ -20034,7 +20164,7 @@ | |
#include <asm/mmu_context.h> | |
#include <asm/tlbflush.h> | |
diff --git a/mm/vmscan.c b/mm/vmscan.c | |
-index 0fc65ace3a4e..e673be68cea3 100644 | |
+index fbc8c8f4fe60..710dcb1e253f 100644 | |
--- a/mm/vmscan.c | |
+++ b/mm/vmscan.c | |
@@ -43,6 +43,7 @@ | |
@@ -20165,12 +20295,12 @@ | |
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); | |
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From 30817d963bfdddf095e330e41317c9efceec642a Mon Sep 17 00:00:00 2001 | |
+From b7d5db9b461acbef045b7be4c93ac44be1bce034 Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Wed, 28 Sep 2022 00:26:29 +0200 | |
-Subject: [PATCH 06/16] mm/khugepaged: add struct collapse_control | |
+Subject: [PATCH 06/17] mm/khugepaged: add struct collapse_control | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -20340,7 +20470,7 @@ | |
#define MAP_FILE 0 | |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c | |
-index 6eb4b1799b79..42cdc3338adc 100644 | |
+index 9efa67e45534..dc2faf99f4f2 100644 | |
--- a/mm/huge_memory.c | |
+++ b/mm/huge_memory.c | |
@@ -71,9 +71,8 @@ static atomic_t huge_zero_refcount; | |
@@ -20413,7 +20543,7 @@ | |
/* | |
* in mm/page_alloc.c | |
diff --git a/mm/khugepaged.c b/mm/khugepaged.c | |
-index 01f71786d530..5f7c60b8b269 100644 | |
+index 70b7ac66411c..0bcba493ebb4 100644 | |
--- a/mm/khugepaged.c | |
+++ b/mm/khugepaged.c | |
@@ -28,6 +28,7 @@ enum scan_result { | |
@@ -20994,7 +21124,7 @@ | |
goto out_up_write; | |
anon_vma_lock_write(vma->anon_vma); | |
-@@ -1093,11 +1081,11 @@ static void collapse_huge_page(struct mm_struct *mm, | |
+@@ -1095,11 +1083,11 @@ static void collapse_huge_page(struct mm_struct *mm, | |
mmu_notifier_invalidate_range_end(&range); | |
spin_lock(pte_ptl); | |
@@ -21009,7 +21139,7 @@ | |
pte_unmap(pte); | |
spin_lock(pmd_ptl); | |
BUG_ON(!pmd_none(*pmd)); | |
-@@ -1109,7 +1097,6 @@ static void collapse_huge_page(struct mm_struct *mm, | |
+@@ -1111,7 +1099,6 @@ static void collapse_huge_page(struct mm_struct *mm, | |
pmd_populate(mm, pmd, pmd_pgtable(_pmd)); | |
spin_unlock(pmd_ptl); | |
anon_vma_unlock_write(vma->anon_vma); | |
@@ -21017,7 +21147,7 @@ | |
goto out_up_write; | |
} | |
-@@ -1119,8 +1106,8 @@ static void collapse_huge_page(struct mm_struct *mm, | |
+@@ -1121,8 +1108,8 @@ static void collapse_huge_page(struct mm_struct *mm, | |
*/ | |
anon_vma_unlock_write(vma->anon_vma); | |
@@ -21028,7 +21158,7 @@ | |
pte_unmap(pte); | |
/* | |
* spin_lock() below is not the equivalent of smp_wmb(), but | |
-@@ -1128,42 +1115,43 @@ static void collapse_huge_page(struct mm_struct *mm, | |
+@@ -1130,42 +1117,43 @@ static void collapse_huge_page(struct mm_struct *mm, | |
* avoid the copy_huge_page writes to become visible after | |
* the set_pmd_at() write. | |
*/ | |
@@ -21087,7 +21217,7 @@ | |
int none_or_zero = 0, shared = 0; | |
struct page *page = NULL; | |
unsigned long _address; | |
-@@ -1173,19 +1161,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |
+@@ -1175,19 +1163,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |
VM_BUG_ON(address & ~HPAGE_PMD_MASK); | |
@@ -21113,7 +21243,7 @@ | |
/* | |
* Always be strict with uffd-wp | |
* enabled swap entries. Please see | |
-@@ -1203,8 +1191,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |
+@@ -1205,8 +1193,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |
} | |
} | |
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { | |
@@ -21125,7 +21255,7 @@ | |
continue; | |
} else { | |
result = SCAN_EXCEED_NONE_PTE; | |
-@@ -1234,27 +1224,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |
+@@ -1236,27 +1226,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |
goto out_unmap; | |
} | |
@@ -21164,7 +21294,7 @@ | |
if (!PageLRU(page)) { | |
result = SCAN_PAGE_LRU; | |
goto out_unmap; | |
-@@ -1289,31 +1282,38 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |
+@@ -1291,31 +1284,38 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |
result = SCAN_PAGE_COUNT; | |
goto out_unmap; | |
} | |
@@ -21213,7 +21343,7 @@ | |
} | |
static void collect_mm_slot(struct mm_slot *mm_slot) | |
-@@ -1322,7 +1322,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) | |
+@@ -1324,7 +1324,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) | |
lockdep_assert_held(&khugepaged_mm_lock); | |
@@ -21222,7 +21352,7 @@ | |
/* free mm_slot */ | |
hash_del(&mm_slot->hash); | |
list_del(&mm_slot->mm_node); | |
-@@ -1400,12 +1400,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) | |
+@@ -1402,12 +1402,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) | |
return; | |
/* | |
@@ -21241,7 +21371,7 @@ | |
return; | |
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ | |
-@@ -1420,8 +1421,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) | |
+@@ -1422,8 +1423,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) | |
if (!PageHead(hpage)) | |
goto drop_hpage; | |
@@ -21251,7 +21381,7 @@ | |
goto drop_hpage; | |
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); | |
-@@ -1495,7 +1495,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) | |
+@@ -1497,7 +1497,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) | |
if (!mmap_write_trylock(mm)) | |
return; | |
@@ -21260,7 +21390,7 @@ | |
goto out; | |
for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) | |
-@@ -1539,8 +1539,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |
+@@ -1541,8 +1541,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |
if (vma->vm_end < addr + HPAGE_PMD_SIZE) | |
continue; | |
mm = vma->vm_mm; | |
@@ -21270,7 +21400,7 @@ | |
continue; | |
/* | |
* We need exclusive mmap_lock to retract page table. | |
-@@ -1558,7 +1557,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |
+@@ -1560,7 +1559,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |
* it'll always mapped in small page size for uffd-wp | |
* registered ranges. | |
*/ | |
@@ -21280,7 +21410,7 @@ | |
collapse_and_free_pmd(mm, vma, addr, pmd); | |
mmap_write_unlock(mm); | |
} else { | |
-@@ -1575,8 +1575,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |
+@@ -1577,8 +1577,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |
* @mm: process address space where collapse happens | |
* @file: file that collapse on | |
* @start: collapse start address | |
@@ -21290,7 +21420,7 @@ | |
* | |
* Basic scheme is simple, details are more complex: | |
* - allocate and lock a new huge page; | |
-@@ -1593,13 +1592,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |
+@@ -1595,13 +1594,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |
* + restore gaps in the page cache; | |
* + unlock and free huge page; | |
*/ | |
@@ -21307,7 +21437,7 @@ | |
pgoff_t index, end = start + HPAGE_PMD_NR; | |
LIST_HEAD(pagelist); | |
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); | |
-@@ -1610,20 +1607,9 @@ static void collapse_file(struct mm_struct *mm, | |
+@@ -1612,20 +1609,9 @@ static void collapse_file(struct mm_struct *mm, | |
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); | |
VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); | |
@@ -21330,7 +21460,7 @@ | |
/* | |
* Ensure we have slots for all the pages in the range. This is | |
-@@ -1641,14 +1627,14 @@ static void collapse_file(struct mm_struct *mm, | |
+@@ -1643,14 +1629,14 @@ static void collapse_file(struct mm_struct *mm, | |
} | |
} while (1); | |
@@ -21350,7 +21480,7 @@ | |
* It's safe to insert it into the page cache, because nobody would | |
* be able to map it or use it in another way until we unlock it. | |
*/ | |
-@@ -1676,7 +1662,7 @@ static void collapse_file(struct mm_struct *mm, | |
+@@ -1678,7 +1664,7 @@ static void collapse_file(struct mm_struct *mm, | |
result = SCAN_FAIL; | |
goto xa_locked; | |
} | |
@@ -21359,7 +21489,7 @@ | |
nr_none++; | |
continue; | |
} | |
-@@ -1818,19 +1804,19 @@ static void collapse_file(struct mm_struct *mm, | |
+@@ -1820,19 +1806,19 @@ static void collapse_file(struct mm_struct *mm, | |
list_add_tail(&page->lru, &pagelist); | |
/* Finally, replace with the new page. */ | |
@@ -21383,7 +21513,7 @@ | |
filemap_nr_thps_inc(mapping); | |
/* | |
* Paired with smp_mb() in do_dentry_open() to ensure | |
-@@ -1841,21 +1827,21 @@ static void collapse_file(struct mm_struct *mm, | |
+@@ -1843,21 +1829,21 @@ static void collapse_file(struct mm_struct *mm, | |
smp_mb(); | |
if (inode_is_open_for_write(mapping->host)) { | |
result = SCAN_FAIL; | |
@@ -21409,7 +21539,7 @@ | |
xa_locked: | |
xas_unlock_irq(&xas); | |
xa_unlocked: | |
-@@ -1877,11 +1863,11 @@ static void collapse_file(struct mm_struct *mm, | |
+@@ -1879,11 +1865,11 @@ static void collapse_file(struct mm_struct *mm, | |
index = start; | |
list_for_each_entry_safe(page, tmp, &pagelist, lru) { | |
while (index < page->index) { | |
@@ -21424,7 +21554,7 @@ | |
list_del(&page->lru); | |
page->mapping = NULL; | |
page_ref_unfreeze(page, 1); | |
-@@ -1892,23 +1878,22 @@ static void collapse_file(struct mm_struct *mm, | |
+@@ -1894,23 +1880,22 @@ static void collapse_file(struct mm_struct *mm, | |
index++; | |
} | |
while (index < end) { | |
@@ -21455,7 +21585,7 @@ | |
} else { | |
struct page *page; | |
-@@ -1947,19 +1932,23 @@ static void collapse_file(struct mm_struct *mm, | |
+@@ -1949,19 +1934,23 @@ static void collapse_file(struct mm_struct *mm, | |
VM_BUG_ON(nr_none); | |
xas_unlock_irq(&xas); | |
@@ -21485,7 +21615,7 @@ | |
{ | |
struct page *page = NULL; | |
struct address_space *mapping = file->f_mapping; | |
-@@ -1970,14 +1959,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, | |
+@@ -1972,14 +1961,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, | |
present = 0; | |
swap = 0; | |
@@ -21504,7 +21634,7 @@ | |
result = SCAN_EXCEED_SWAP_PTE; | |
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); | |
break; | |
-@@ -1995,11 +1986,11 @@ static void khugepaged_scan_file(struct mm_struct *mm, | |
+@@ -1997,11 +1988,11 @@ static void khugepaged_scan_file(struct mm_struct *mm, | |
} | |
node = page_to_nid(page); | |
@@ -21518,7 +21648,7 @@ | |
if (!PageLRU(page)) { | |
result = SCAN_PAGE_LRU; | |
-@@ -2028,20 +2019,21 @@ static void khugepaged_scan_file(struct mm_struct *mm, | |
+@@ -2030,20 +2021,21 @@ static void khugepaged_scan_file(struct mm_struct *mm, | |
rcu_read_unlock(); | |
if (result == SCAN_SUCCEED) { | |
@@ -21545,7 +21675,7 @@ | |
{ | |
BUILD_BUG(); | |
} | |
-@@ -2051,8 +2043,8 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) | |
+@@ -2053,8 +2045,8 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) | |
} | |
#endif | |
@@ -21556,7 +21686,7 @@ | |
__releases(&khugepaged_mm_lock) | |
__acquires(&khugepaged_mm_lock) | |
{ | |
-@@ -2063,6 +2055,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
+@@ -2065,6 +2057,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
VM_BUG_ON(!pages); | |
lockdep_assert_held(&khugepaged_mm_lock); | |
@@ -21564,7 +21694,7 @@ | |
if (khugepaged_scan.mm_slot) | |
mm_slot = khugepaged_scan.mm_slot; | |
-@@ -2083,7 +2076,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
+@@ -2085,7 +2078,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
vma = NULL; | |
if (unlikely(!mmap_read_trylock(mm))) | |
goto breakouterloop_mmap_lock; | |
@@ -21573,7 +21703,7 @@ | |
vma = find_vma(mm, khugepaged_scan.address); | |
progress++; | |
-@@ -2091,11 +2084,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
+@@ -2093,11 +2086,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
unsigned long hstart, hend; | |
cond_resched(); | |
@@ -21587,7 +21717,7 @@ | |
skip: | |
progress++; | |
continue; | |
-@@ -2109,9 +2102,10 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
+@@ -2111,9 +2104,10 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); | |
while (khugepaged_scan.address < hend) { | |
@@ -21600,7 +21730,7 @@ | |
goto breakouterloop; | |
VM_BUG_ON(khugepaged_scan.address < hstart || | |
-@@ -2123,19 +2117,29 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
+@@ -2125,19 +2119,29 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
khugepaged_scan.address); | |
mmap_read_unlock(mm); | |
@@ -21637,7 +21767,7 @@ | |
goto breakouterloop_mmap_lock; | |
if (progress >= pages) | |
goto breakouterloop; | |
-@@ -2151,7 +2155,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
+@@ -2153,7 +2157,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |
* Release the current mm_slot if this mm is about to die, or | |
* if we scanned all vmas of this mm. | |
*/ | |
@@ -21646,7 +21776,7 @@ | |
/* | |
* Make sure that if mm_users is reaching zero while | |
* khugepaged runs here, khugepaged_exit will find | |
-@@ -2185,19 +2189,16 @@ static int khugepaged_wait_event(void) | |
+@@ -2187,19 +2191,16 @@ static int khugepaged_wait_event(void) | |
kthread_should_stop(); | |
} | |
@@ -21669,7 +21799,7 @@ | |
cond_resched(); | |
if (unlikely(kthread_should_stop() || try_to_freeze())) | |
-@@ -2209,14 +2210,25 @@ static void khugepaged_do_scan(void) | |
+@@ -2211,14 +2212,25 @@ static void khugepaged_do_scan(void) | |
if (khugepaged_has_work() && | |
pass_through_head < 2) | |
progress += khugepaged_scan_mm_slot(pages - progress, | |
@@ -21699,7 +21829,7 @@ | |
} | |
static bool khugepaged_should_wakeup(void) | |
-@@ -2253,7 +2265,7 @@ static int khugepaged(void *none) | |
+@@ -2255,7 +2267,7 @@ static int khugepaged(void *none) | |
set_user_nice(current, MAX_NICE); | |
while (!kthread_should_stop()) { | |
@@ -21708,7 +21838,7 @@ | |
khugepaged_wait_work(); | |
} | |
-@@ -2352,3 +2364,120 @@ void khugepaged_min_free_kbytes_update(void) | |
+@@ -2354,3 +2366,120 @@ void khugepaged_min_free_kbytes_update(void) | |
set_recommended_min_free_kbytes(); | |
mutex_unlock(&khugepaged_mutex); | |
} | |
@@ -21858,7 +21988,7 @@ | |
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, | |
addr + PAGE_SIZE); | |
diff --git a/mm/madvise.c b/mm/madvise.c | |
-index 5f0f0948a50e..af97100a0727 100644 | |
+index 9ff51650f4f0..4f86eb7f554d 100644 | |
--- a/mm/madvise.c | |
+++ b/mm/madvise.c | |
@@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior) | |
@@ -21869,7 +21999,7 @@ | |
return 0; | |
default: | |
/* be safe, default to 1. list exceptions explicitly */ | |
-@@ -1057,6 +1058,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, | |
+@@ -1060,6 +1061,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, | |
if (error) | |
goto out; | |
break; | |
@@ -21878,7 +22008,7 @@ | |
} | |
anon_name = anon_vma_name(vma); | |
-@@ -1150,6 +1153,7 @@ madvise_behavior_valid(int behavior) | |
+@@ -1153,6 +1156,7 @@ madvise_behavior_valid(int behavior) | |
#ifdef CONFIG_TRANSPARENT_HUGEPAGE | |
case MADV_HUGEPAGE: | |
case MADV_NOHUGEPAGE: | |
@@ -21886,7 +22016,7 @@ | |
#endif | |
case MADV_DONTDUMP: | |
case MADV_DODUMP: | |
-@@ -1166,13 +1170,13 @@ madvise_behavior_valid(int behavior) | |
+@@ -1169,13 +1173,13 @@ madvise_behavior_valid(int behavior) | |
} | |
} | |
@@ -21902,7 +22032,7 @@ | |
return true; | |
default: | |
return false; | |
-@@ -1339,6 +1343,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, | |
+@@ -1342,6 +1346,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, | |
* MADV_NOHUGEPAGE - mark the given range as not worth being backed by | |
* transparent huge pages so the existing pages will not be | |
* coalesced into THP and new pages will not be allocated as THP. | |
@@ -21911,10 +22041,10 @@ | |
* from being included in its core dump. | |
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. | |
diff --git a/mm/memory.c b/mm/memory.c | |
-index 3a3d8721bf4c..e58d5d522467 100644 | |
+index 7032db10622b..eccc236d1351 100644 | |
--- a/mm/memory.c | |
+++ b/mm/memory.c | |
-@@ -4986,7 +4986,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, | |
+@@ -4992,7 +4992,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, | |
return VM_FAULT_OOM; | |
retry_pud: | |
if (pud_none(*vmf.pud) && | |
@@ -21923,7 +22053,7 @@ | |
ret = create_huge_pud(&vmf); | |
if (!(ret & VM_FAULT_FALLBACK)) | |
return ret; | |
-@@ -5020,7 +5020,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, | |
+@@ -5026,7 +5026,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, | |
goto retry_pud; | |
if (pmd_none(*vmf.pmd) && | |
@@ -22908,12 +23038,12 @@ | |
restore_settings(0); | |
} | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From 3430d4868012555c67c2ec34b073b0e4ecda986d Mon Sep 17 00:00:00 2001 | |
+From 34110cc92398bd9e82b17a78b64f1f1db3d297ca Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
-Date: Wed, 28 Sep 2022 00:26:48 +0200 | |
-Subject: [PATCH 07/16] mm: multi-gen LRU | |
+Date: Thu, 29 Sep 2022 14:28:01 +0200 | |
+Subject: [PATCH 07/17] mm: multi-gen LRU | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -22954,9 +23084,9 @@ | |
mm/mmzone.c | 2 + | |
mm/rmap.c | 6 + | |
mm/swap.c | 54 +- | |
- mm/vmscan.c | 3253 +++++++++++++++-- | |
+ mm/vmscan.c | 3250 +++++++++++++++-- | |
mm/workingset.c | 110 +- | |
- 39 files changed, 4252 insertions(+), 286 deletions(-) | |
+ 39 files changed, 4249 insertions(+), 286 deletions(-) | |
create mode 100644 Documentation/admin-guide/mm/multigen_lru.rst | |
create mode 100644 Documentation/mm/multigen_lru.rst | |
@@ -23505,7 +23635,7 @@ | |
struct task_struct *t) { return 0; } | |
static inline int cgroupstats_build(struct cgroupstats *stats, | |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h | |
-index 6257867fbf95..207cfd3b42e5 100644 | |
+index 567f12323f55..877cbcbc6ed9 100644 | |
--- a/include/linux/memcontrol.h | |
+++ b/include/linux/memcontrol.h | |
@@ -350,6 +350,11 @@ struct mem_cgroup { | |
@@ -24495,7 +24625,7 @@ | |
endmenu | |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c | |
-index 42cdc3338adc..786497dd5f26 100644 | |
+index dc2faf99f4f2..324c2d68610b 100644 | |
--- a/mm/huge_memory.c | |
+++ b/mm/huge_memory.c | |
@@ -2423,7 +2423,8 @@ static void __split_huge_page_tail(struct page *head, int tail, | |
@@ -24588,7 +24718,7 @@ | |
.post_attach = mem_cgroup_move_task, | |
.dfl_cftypes = memory_files, | |
diff --git a/mm/memory.c b/mm/memory.c | |
-index e58d5d522467..bc4dc2e45dcc 100644 | |
+index eccc236d1351..2c0e794b8093 100644 | |
--- a/mm/memory.c | |
+++ b/mm/memory.c | |
@@ -126,18 +126,6 @@ int randomize_va_space __read_mostly = | |
@@ -24619,7 +24749,7 @@ | |
pte_t entry; | |
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); | |
-@@ -5115,6 +5103,27 @@ static inline void mm_account_fault(struct pt_regs *regs, | |
+@@ -5121,6 +5109,27 @@ static inline void mm_account_fault(struct pt_regs *regs, | |
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); | |
} | |
@@ -24647,7 +24777,7 @@ | |
/* | |
* By the time we get here, we already hold the mm semaphore | |
* | |
-@@ -5146,11 +5155,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |
+@@ -5152,11 +5161,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |
if (flags & FAULT_FLAG_USER) | |
mem_cgroup_enter_user_fault(); | |
@@ -24824,7 +24954,7 @@ | |
folio_get(folio); | |
diff --git a/mm/vmscan.c b/mm/vmscan.c | |
-index e673be68cea3..feb8416d8edd 100644 | |
+index 710dcb1e253f..d4926208fe86 100644 | |
--- a/mm/vmscan.c | |
+++ b/mm/vmscan.c | |
@@ -50,6 +50,10 @@ | |
@@ -24989,7 +25119,7 @@ | |
/* | |
* Determine how aggressively the anon and file LRU lists should be | |
* scanned. | |
-@@ -2980,159 +3103,2912 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, | |
+@@ -2980,159 +3103,2909 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, | |
return can_demote(pgdat->node_id, sc); | |
} | |
@@ -26445,8 +26575,6 @@ | |
+ if (wq_has_sleeper(&lruvec->mm_state.wait)) | |
+ wake_up_all(&lruvec->mm_state.wait); | |
+ | |
-+ wakeup_flusher_threads(WB_REASON_VMSCAN); | |
-+ | |
+ return true; | |
+} | |
+ | |
@@ -27110,7 +27238,7 @@ | |
+ DEFINE_MAX_SEQ(lruvec); | |
+ | |
+ if (!current_is_kswapd()) { | |
-+ /* age each memcg once to ensure fairness */ | |
++ /* age each memcg at most once to ensure fairness */ | |
+ if (max_seq - seq > 1) | |
+ return true; | |
+ | |
@@ -27135,10 +27263,9 @@ | |
+ | |
+ /* | |
+ * A minimum amount of work was done under global memory pressure. For | |
-+ * kswapd, it may be overshooting. For direct reclaim, the target isn't | |
-+ * met, and yet the allocation may still succeed, since kswapd may have | |
-+ * caught up. In either case, it's better to stop now, and restart if | |
-+ * necessary. | |
++ * kswapd, it may be overshooting. For direct reclaim, the allocation | |
++ * may succeed if all suitable zones are somewhat safe. In either case, | |
++ * it's better to stop now, and restart later if necessary. | |
+ */ | |
+ for (i = 0; i <= sc->reclaim_idx; i++) { | |
+ unsigned long wmark; | |
@@ -28030,7 +28157,7 @@ | |
* where always a non-zero amount of pages were scanned. | |
*/ | |
if (!nr_reclaimed) | |
-@@ -3230,109 +6106,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |
+@@ -3230,109 +6103,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |
unsigned long nr_reclaimed, nr_scanned; | |
struct lruvec *target_lruvec; | |
bool reclaimable = false; | |
@@ -28141,7 +28268,7 @@ | |
shrink_node_memcgs(pgdat, sc); | |
-@@ -3590,11 +6373,14 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) | |
+@@ -3590,11 +6370,14 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) | |
struct lruvec *target_lruvec; | |
unsigned long refaults; | |
@@ -28158,7 +28285,7 @@ | |
} | |
/* | |
-@@ -3956,12 +6742,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |
+@@ -3956,12 +6739,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |
} | |
#endif | |
@@ -28177,7 +28304,7 @@ | |
if (!can_age_anon_pages(pgdat, sc)) | |
return; | |
-@@ -4281,12 +7071,11 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) | |
+@@ -4281,12 +7068,11 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) | |
sc.may_swap = !nr_boost_reclaim; | |
/* | |
@@ -28345,12 +28472,12 @@ | |
rcu_read_lock(); | |
/* | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From f7046da0d2b40d6725122f9d3ed897a12a8fda63 Mon Sep 17 00:00:00 2001 | |
+From 390083dc23a0cad9d4870a1f4bd5984760f94bf4 Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Wed, 28 Sep 2022 00:27:32 +0200 | |
-Subject: [PATCH 08/16] Introducing the Maple Tree | |
+Subject: [PATCH 08/17] Introducing the Maple Tree | |
The maple tree is an RCU-safe range based B-tree designed to use modern | |
processor cache efficiently. There are a number of places in the kernel | |
@@ -28772,10 +28899,10 @@ | |
+.. kernel-doc:: include/linux/maple_tree.h | |
+.. kernel-doc:: lib/maple_tree.c | |
diff --git a/MAINTAINERS b/MAINTAINERS | |
-index a29c9731350c..96a09757feb3 100644 | |
+index 594e31ec15cb..9a5a422817af 100644 | |
--- a/MAINTAINERS | |
+++ b/MAINTAINERS | |
-@@ -12094,6 +12094,18 @@ L: [email protected] | |
+@@ -12093,6 +12093,18 @@ L: [email protected] | |
S: Maintained | |
W: http://www.kernel.org/doc/man-pages | |
@@ -29367,10 +29494,10 @@ | |
if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) | |
diff --git a/fs/coredump.c b/fs/coredump.c | |
-index 9f4aae202109..35f2af85b9bc 100644 | |
+index 1ab4f5b76a1e..debcebabcd73 100644 | |
--- a/fs/coredump.c | |
+++ b/fs/coredump.c | |
-@@ -1072,30 +1072,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, | |
+@@ -1100,30 +1100,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, | |
return vma->vm_end - vma->vm_start; | |
} | |
@@ -29408,7 +29535,7 @@ | |
return gate_vma; | |
} | |
-@@ -1119,9 +1109,10 @@ static void free_vma_snapshot(struct coredump_params *cprm) | |
+@@ -1147,9 +1137,10 @@ static void free_vma_snapshot(struct coredump_params *cprm) | |
*/ | |
static bool dump_vma_snapshot(struct coredump_params *cprm) | |
{ | |
@@ -29421,7 +29548,7 @@ | |
/* | |
* Once the stack expansion code is fixed to not change VMA bounds | |
-@@ -1141,8 +1132,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) | |
+@@ -1169,8 +1160,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) | |
return false; | |
} | |
@@ -29431,7 +29558,7 @@ | |
struct core_vma_metadata *m = cprm->vma_meta + i; | |
m->start = vma->vm_start; | |
-@@ -1150,10 +1140,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) | |
+@@ -1178,10 +1168,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) | |
m->flags = vma->vm_flags; | |
m->dump_size = vma_dump_size(vma, cprm->mm_flags); | |
m->pgoff = vma->vm_pgoff; | |
@@ -31470,10 +31597,10 @@ | |
flush_icache_range(addr, addr + BREAK_INSTR_SIZE); | |
} | |
diff --git a/kernel/events/core.c b/kernel/events/core.c | |
-index 2621fd24ad26..101c5912c3fc 100644 | |
+index ff4bffc502c6..7a23df62d2e4 100644 | |
--- a/kernel/events/core.c | |
+++ b/kernel/events/core.c | |
-@@ -10229,8 +10229,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter, | |
+@@ -10238,8 +10238,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter, | |
struct perf_addr_filter_range *fr) | |
{ | |
struct vm_area_struct *vma; | |
@@ -77436,7 +77563,7 @@ | |
atomic_read(&mm->mm_count), | |
mm_pgtables_bytes(mm), | |
diff --git a/mm/gup.c b/mm/gup.c | |
-index 5abdaf487460..5f3c464dbce1 100644 | |
+index 00926abb4426..4da7f1e3bba2 100644 | |
--- a/mm/gup.c | |
+++ b/mm/gup.c | |
@@ -1667,10 +1667,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) | |
@@ -77455,7 +77582,7 @@ | |
/* | |
* Set [nstart; nend) to intersection of desired address | |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c | |
-index 786497dd5f26..cca500fcfb64 100644 | |
+index 324c2d68610b..51f8e41b6568 100644 | |
--- a/mm/huge_memory.c | |
+++ b/mm/huge_memory.c | |
@@ -2319,11 +2319,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, | |
@@ -77520,10 +77647,10 @@ | |
#ifdef CONFIG_MMU | |
diff --git a/mm/khugepaged.c b/mm/khugepaged.c | |
-index 5f7c60b8b269..df890338daed 100644 | |
+index 0bcba493ebb4..256a9c7976f9 100644 | |
--- a/mm/khugepaged.c | |
+++ b/mm/khugepaged.c | |
-@@ -1387,7 +1387,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v | |
+@@ -1389,7 +1389,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v | |
void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) | |
{ | |
unsigned long haddr = addr & HPAGE_PMD_MASK; | |
@@ -77532,7 +77659,7 @@ | |
struct page *hpage; | |
pte_t *start_pte, *pte; | |
pmd_t *pmd; | |
-@@ -2048,6 +2048,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, | |
+@@ -2050,6 +2050,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, | |
__releases(&khugepaged_mm_lock) | |
__acquires(&khugepaged_mm_lock) | |
{ | |
@@ -77540,7 +77667,7 @@ | |
struct mm_slot *mm_slot; | |
struct mm_struct *mm; | |
struct vm_area_struct *vma; | |
-@@ -2076,11 +2077,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, | |
+@@ -2078,11 +2079,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, | |
vma = NULL; | |
if (unlikely(!mmap_read_trylock(mm))) | |
goto breakouterloop_mmap_lock; | |
@@ -77613,10 +77740,10 @@ | |
ksm_scan.rmap_list = &slot->rmap_list; | |
} | |
diff --git a/mm/madvise.c b/mm/madvise.c | |
-index af97100a0727..682e1d161aef 100644 | |
+index 4f86eb7f554d..a3fc4cd32ed3 100644 | |
--- a/mm/madvise.c | |
+++ b/mm/madvise.c | |
-@@ -1242,7 +1242,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, | |
+@@ -1245,7 +1245,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, | |
if (start >= end) | |
break; | |
if (prev) | |
@@ -77650,7 +77777,7 @@ | |
atomic_dec(&mc.from->moving_account); | |
} | |
diff --git a/mm/memory.c b/mm/memory.c | |
-index bc4dc2e45dcc..acc2e88f4984 100644 | |
+index 2c0e794b8093..de427784f29d 100644 | |
--- a/mm/memory.c | |
+++ b/mm/memory.c | |
@@ -391,12 +391,21 @@ void free_pgd_range(struct mmu_gather *tlb, | |
@@ -81568,7 +81695,7 @@ | |
} | |
mmap_read_unlock(mm); | |
diff --git a/mm/util.c b/mm/util.c | |
-index c9439c66d8cf..1266a33a49ea 100644 | |
+index 346e40177bc6..50427596f208 100644 | |
--- a/mm/util.c | |
+++ b/mm/util.c | |
@@ -272,38 +272,6 @@ void *memdup_user_nul(const void __user *src, size_t len) | |
@@ -81734,7 +81861,7 @@ | |
-} | |
-#endif | |
diff --git a/mm/vmscan.c b/mm/vmscan.c | |
-index feb8416d8edd..f85a9c915d75 100644 | |
+index d4926208fe86..301f38d3165b 100644 | |
--- a/mm/vmscan.c | |
+++ b/mm/vmscan.c | |
@@ -3778,23 +3778,17 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk | |
@@ -82178,12 +82305,12 @@ | |
+#define trace_ma_read(a, b) do {} while (0) | |
+#define trace_ma_write(a, b, c, d) do {} while (0) | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From 79eeeac092d265211e4f6ce60f69ad549d8a201c Mon Sep 17 00:00:00 2001 | |
+From a18e54491eba670bdaea5b3d27131fea0e96726b Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Mon, 26 Sep 2022 00:18:41 +0200 | |
-Subject: [PATCH 09/16] mm-cleanup | |
+Subject: [PATCH 09/17] mm-cleanup | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -82320,7 +82447,7 @@ | |
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) | |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c | |
-index cf131d6e08fb..292ed1bb6a5a 100644 | |
+index cc6179d3a7dc..65ffd285db54 100644 | |
--- a/mm/page_alloc.c | |
+++ b/mm/page_alloc.c | |
@@ -870,7 +870,8 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, | |
@@ -82381,7 +82508,7 @@ | |
*/ | |
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, | |
long nr_account) | |
-@@ -5121,7 +5115,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |
+@@ -5147,7 +5141,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); | |
if (reserve_flags) | |
@@ -82391,7 +82518,7 @@ | |
/* | |
* Reset the nodemask and zonelist iterators if memory policies can be | |
-@@ -5238,7 +5233,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |
+@@ -5272,7 +5267,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |
* so that we can identify them and convert them to something | |
* else. | |
*/ | |
@@ -82400,7 +82527,7 @@ | |
/* | |
* Help non-failing allocations by giving them access to memory | |
-@@ -6507,7 +6502,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta | |
+@@ -6553,7 +6548,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta | |
#define BOOT_PAGESET_BATCH 1 | |
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); | |
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); | |
@@ -82409,7 +82536,7 @@ | |
static void __build_all_zonelists(void *data) | |
{ | |
-@@ -6810,7 +6805,7 @@ void __ref memmap_init_zone_device(struct zone *zone, | |
+@@ -6855,7 +6850,7 @@ void __ref memmap_init_zone_device(struct zone *zone, | |
unsigned long start = jiffies; | |
int nid = pgdat->node_id; | |
@@ -82418,7 +82545,7 @@ | |
return; | |
/* | |
-@@ -6986,7 +6981,7 @@ static int zone_batchsize(struct zone *zone) | |
+@@ -7031,7 +7026,7 @@ static int zone_batchsize(struct zone *zone) | |
* size is striking a balance between allocation latency | |
* and zone lock contention. | |
*/ | |
@@ -82427,7 +82554,7 @@ | |
batch /= 4; /* We effectively *= 4 below */ | |
if (batch < 1) | |
batch = 1; | |
-@@ -7171,6 +7166,17 @@ void __meminit setup_zone_pageset(struct zone *zone) | |
+@@ -7216,6 +7211,17 @@ void __meminit setup_zone_pageset(struct zone *zone) | |
zone_set_pageset_high_and_batch(zone, 0); | |
} | |
@@ -82445,7 +82572,7 @@ | |
/* | |
* Allocate per cpu pagesets and initialize them. | |
* Before this call only boot pagesets were available. | |
-@@ -8461,8 +8467,8 @@ void __init mem_init_print_info(void) | |
+@@ -8506,8 +8512,8 @@ void __init mem_init_print_info(void) | |
#endif | |
")\n", | |
K(nr_free_pages()), K(physpages), | |
@@ -82456,7 +82583,7 @@ | |
K(physpages - totalram_pages() - totalcma_pages), | |
K(totalcma_pages) | |
#ifdef CONFIG_HIGHMEM | |
-@@ -8987,8 +8993,8 @@ void *__init alloc_large_system_hash(const char *tablename, | |
+@@ -9032,8 +9038,8 @@ void *__init alloc_large_system_hash(const char *tablename, | |
numentries -= arch_reserved_kernel_pages(); | |
/* It isn't necessary when PAGE_SIZE >= 1MB */ | |
@@ -82467,7 +82594,7 @@ | |
#if __BITS_PER_LONG > 32 | |
if (!high_limit) { | |
-@@ -9412,17 +9418,6 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) | |
+@@ -9457,17 +9463,6 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) | |
} | |
EXPORT_SYMBOL(free_contig_range); | |
@@ -82485,7 +82612,7 @@ | |
/* | |
* Effectively disable pcplists for the zone by setting the high limit to 0 | |
* and draining all cpus. A concurrent page freeing on another CPU that's about | |
-@@ -9455,9 +9450,11 @@ void zone_pcp_reset(struct zone *zone) | |
+@@ -9500,9 +9495,11 @@ void zone_pcp_reset(struct zone *zone) | |
drain_zonestat(zone, pzstats); | |
} | |
free_percpu(zone->per_cpu_pageset); | |
@@ -82500,12 +82627,1194 @@ | |
} | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From 6257c94a850dc4b3faa5a55be5831de4f8777cac Mon Sep 17 00:00:00 2001 | |
+From f5b84ebf4e16a85f85aad297a18df2f6d58a7ace Mon Sep 17 00:00:00 2001 | |
+From: Peter Jung <[email protected]> | |
+Date: Wed, 28 Sep 2022 19:47:35 +0200 | |
+Subject: [PATCH 10/17] THP Shrinker | |
+ | |
+Transparent Hugepages use a larger page size of 2MB in comparison to | |
+normal sized pages that are 4kb. A larger page size allows for fewer TLB | |
+cache misses and thus more efficient use of the CPU. Using a larger page | |
+size also results in more memory waste, which can hurt performance in some | |
+use cases. THPs are currently enabled in the Linux Kernel by applications | |
+in limited virtual address ranges via the madvise system call. The THP | |
+shrinker tries to find a balance between increased use of THPs, and | |
+increased use of memory. It shrinks the size of memory by removing the | |
+underutilized THPs that are identified by the thp_utilization scanner. | |
+ | |
+In our experiments we have noticed that the least utilized THPs are almost | |
+entirely unutilized. | |
+ | |
+Sample Output: | |
+ | |
+Utilized[0-50]: 1331 680884 | |
+Utilized[51-101]: 9 3983 | |
+Utilized[102-152]: 3 1187 | |
+Utilized[153-203]: 0 0 | |
+Utilized[204-255]: 2 539 | |
+Utilized[256-306]: 5 1135 | |
+Utilized[307-357]: 1 192 | |
+Utilized[358-408]: 0 0 | |
+Utilized[409-459]: 1 57 | |
+Utilized[460-512]: 400 13 | |
+Last Scan Time: 223.98s | |
+Last Scan Duration: 70.65s | |
+ | |
+Above is a sample obtained from one of our test machines when THP is always | |
+enabled. Of the 1331 THPs in this thp_utilization sample that have from | |
+0-50 utilized subpages, we see that there are 680884 free pages. This | |
+comes out to 680884 / (512 * 1331) = 99.91% zero pages in the least | |
+utilized bucket. This represents 680884 * 4KB = 2.7GB memory waste. | |
+ | |
+Also note that the vast majority of pages are either in the least utilized | |
+[0-50] or most utilized [460-512] buckets. The least utilized THPs are | |
+responsible for almost all of the memory waste when THP is always | |
+enabled. Thus by clearing out THPs in the lowest utilization bucket | |
+we extract most of the improvement in CPU efficiency. We have seen | |
+similar results on our production hosts. | |
+ | |
+This patchset introduces the THP shrinker we have developed to identify | |
+and split the least utilized THPs. It includes the thp_utilization | |
+changes that groups anonymous THPs into buckets, the split_huge_page() | |
+changes that identify and zap zero 4KB pages within THPs and the shrinker | |
+changes. It should be noted that the split_huge_page() changes are based | |
+off previous work done by Yu Zhao. | |
+ | |
+In the future, we intend to allow additional tuning to the shrinker | |
+based on workload depending on CPU/IO/Memory pressure and the | |
+amount of anonymous memory. The long term goal is to eventually always | |
+enable THP for all applications and deprecate madvise entirely. | |
+ | |
+In production we thus far have observed 2-3% reduction in overall cpu | |
+usage on stateless web servers when THP is always enabled. | |
+ | |
+Signed-off-by: Peter Jung <[email protected]> | |
+--- | |
+ Documentation/admin-guide/mm/transhuge.rst | 9 + | |
+ include/linux/huge_mm.h | 10 + | |
+ include/linux/list_lru.h | 24 ++ | |
+ include/linux/mm_types.h | 5 + | |
+ include/linux/rmap.h | 2 +- | |
+ include/linux/vm_event_item.h | 3 + | |
+ mm/huge_memory.c | 342 +++++++++++++++++- | |
+ mm/list_lru.c | 49 +++ | |
+ mm/migrate.c | 72 +++- | |
+ mm/migrate_device.c | 4 +- | |
+ mm/page_alloc.c | 6 + | |
+ mm/vmstat.c | 3 + | |
+ .../selftests/vm/split_huge_page_test.c | 113 +++++- | |
+ tools/testing/selftests/vm/vm_util.c | 23 ++ | |
+ tools/testing/selftests/vm/vm_util.h | 1 + | |
+ 15 files changed, 648 insertions(+), 18 deletions(-) | |
+ | |
+diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst | |
+index c9c37f16eef8..d883ff9fddc7 100644 | |
+--- a/Documentation/admin-guide/mm/transhuge.rst | |
++++ b/Documentation/admin-guide/mm/transhuge.rst | |
+@@ -297,6 +297,15 @@ To identify what applications are mapping file transparent huge pages, it | |
+ is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields | |
+ for each mapping. | |
+ | |
++The utilization of transparent hugepages can be viewed by reading | |
++``/sys/kernel/debug/thp_utilization``. The utilization of a THP is defined | |
++as the ratio of non zero filled 4kb pages to the total number of pages in a | |
++THP. The buckets are labelled by the range of total utilized 4kb pages with | |
++one line per utilization bucket. Each line contains the total number of | |
++THPs in that bucket and the total number of zero filled 4kb pages summed | |
++over all THPs in that bucket. The last two lines show the timestamp and | |
++duration respectively of the most recent scan over all of physical memory. | |
++ | |
+ Note that reading the smaps file is expensive and reading it | |
+ frequently will incur overhead. | |
+ | |
+diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h | |
+index 38265f9f782e..c5400a89ce67 100644 | |
+--- a/include/linux/huge_mm.h | |
++++ b/include/linux/huge_mm.h | |
+@@ -178,6 +178,9 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, | |
+ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, | |
+ unsigned long len, unsigned long pgoff, unsigned long flags); | |
+ | |
++int thp_number_utilized_pages(struct page *page); | |
++int thp_utilization_bucket(int num_utilized_pages); | |
++ | |
+ void prep_transhuge_page(struct page *page); | |
+ void free_transhuge_page(struct page *page); | |
+ | |
+@@ -189,6 +192,8 @@ static inline int split_huge_page(struct page *page) | |
+ } | |
+ void deferred_split_huge_page(struct page *page); | |
+ | |
++void add_underutilized_thp(struct page *page); | |
++ | |
+ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |
+ unsigned long address, bool freeze, struct folio *folio); | |
+ | |
+@@ -302,6 +307,11 @@ static inline struct list_head *page_deferred_list(struct page *page) | |
+ return &page[2].deferred_list; | |
+ } | |
+ | |
++static inline struct list_head *page_underutilized_thp_list(struct page *page) | |
++{ | |
++ return &page[3].underutilized_thp_list; | |
++} | |
++ | |
+ #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | |
+ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) | |
+ #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) | |
+diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h | |
+index b35968ee9fb5..c2cf146ea880 100644 | |
+--- a/include/linux/list_lru.h | |
++++ b/include/linux/list_lru.h | |
+@@ -89,6 +89,18 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren | |
+ */ | |
+ bool list_lru_add(struct list_lru *lru, struct list_head *item); | |
+ | |
++/** | |
++ * list_lru_add_page: add an element to the lru list's tail | |
++ * @list_lru: the lru pointer | |
++ * @page: the page containing the item | |
++ * @item: the item to be added. | 
++ * | 
++ * This function works the same as list_lru_add in terms of list | 
++ * manipulation. Used for non-slab objects contained in the page. | 
++ * | |
++ * Return value: true if the list was updated, false otherwise | |
++ */ | |
++bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item); | |
+ /** | |
+ * list_lru_del: delete an element to the lru list | |
+ * @list_lru: the lru pointer | |
+@@ -102,6 +114,18 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item); | |
+ */ | |
+ bool list_lru_del(struct list_lru *lru, struct list_head *item); | |
+ | |
++/** | |
++ * list_lru_del_page: delete an element from the lru list | 
++ * @list_lru: the lru pointer | |
++ * @page: the page containing the item | |
++ * @item: the item to be deleted. | |
++ * | |
++ * This function works the same as list_lru_del in terms of list | |
++ * manipulation. Used for non-slab objects contained in the page. | 
++ * | |
++ * Return value: true if the list was updated, false otherwise | |
++ */ | |
++bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item); | |
+ /** | |
+ * list_lru_count_one: return the number of objects currently held by @lru | |
+ * @lru: the lru pointer. | |
+diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h | |
+index 5e32211cb5a9..a2a26fc8e89f 100644 | |
+--- a/include/linux/mm_types.h | |
++++ b/include/linux/mm_types.h | |
+@@ -152,6 +152,11 @@ struct page { | |
+ /* For both global and memcg */ | |
+ struct list_head deferred_list; | |
+ }; | |
++ struct { /* Third tail page of compound page */ | |
++ unsigned long _compound_pad_3; /* compound_head */ | |
++ unsigned long _compound_pad_4; | |
++ struct list_head underutilized_thp_list; | |
++ }; | |
+ struct { /* Page table pages */ | |
+ unsigned long _pt_pad_1; /* compound_head */ | |
+ pgtable_t pmd_huge_pte; /* protected by page->ptl */ | |
+diff --git a/include/linux/rmap.h b/include/linux/rmap.h | |
+index b89b4b86951f..f7d5d5639dea 100644 | |
+--- a/include/linux/rmap.h | |
++++ b/include/linux/rmap.h | |
+@@ -372,7 +372,7 @@ int folio_mkclean(struct folio *); | |
+ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, | |
+ struct vm_area_struct *vma); | |
+ | |
+-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); | |
++void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean); | |
+ | |
+ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); | |
+ | |
+diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h | |
+index 3518dba1e02f..3618b10ddec9 100644 | |
+--- a/include/linux/vm_event_item.h | |
++++ b/include/linux/vm_event_item.h | |
+@@ -111,6 +111,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |
+ #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD | |
+ THP_SPLIT_PUD, | |
+ #endif | |
++ THP_SPLIT_FREE, | |
++ THP_SPLIT_UNMAP, | |
++ THP_SPLIT_REMAP_READONLY_ZERO_PAGE, | |
+ THP_ZERO_PAGE_ALLOC, | |
+ THP_ZERO_PAGE_ALLOC_FAILED, | |
+ THP_SWPOUT, | |
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c | |
+index 51f8e41b6568..05428ae7cf2d 100644 | |
+--- a/mm/huge_memory.c | |
++++ b/mm/huge_memory.c | |
+@@ -46,6 +46,16 @@ | |
+ #define CREATE_TRACE_POINTS | |
+ #include <trace/events/thp.h> | |
+ | |
++/* | |
++ * The number of utilization buckets THPs will be grouped in | |
++ * under /sys/kernel/debug/thp_utilization. | |
++ */ | |
++#define THP_UTIL_BUCKET_NR 10 | |
++/* | |
++ * The number of hugepage-sized strides (HPAGE_PMD_NR PFNs each) to scan on | 
++ * each periodic run of the scanner that generates /sys/kernel/debug/thp_utilization. | 
++ */ | |
++#define THP_UTIL_SCAN_SIZE 256 | |
+ /* | |
+ * By default, transparent hugepage support is disabled in order to avoid | |
+ * risking an increased memory footprint for applications that are not | |
+@@ -71,6 +81,27 @@ static atomic_t huge_zero_refcount; | |
+ struct page *huge_zero_page __read_mostly; | |
+ unsigned long huge_zero_pfn __read_mostly = ~0UL; | |
+ | |
++struct list_lru huge_low_util_page_lru; | |
++ | |
++static void thp_utilization_workfn(struct work_struct *work); | |
++static DECLARE_DELAYED_WORK(thp_utilization_work, thp_utilization_workfn); | |
++ | |
++struct thp_scan_info_bucket { | |
++ int nr_thps; | |
++ int nr_zero_pages; | |
++}; | |
++ | |
++struct thp_scan_info { | |
++ struct thp_scan_info_bucket buckets[THP_UTIL_BUCKET_NR]; | |
++ struct zone *scan_zone; | |
++ struct timespec64 last_scan_duration; | |
++ struct timespec64 last_scan_time; | |
++ unsigned long pfn; | |
++}; | |
++ | |
++static struct thp_scan_info thp_scan_debugfs; | |
++static struct thp_scan_info thp_scan; | |
++ | |
+ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, | |
+ bool smaps, bool in_pf, bool enforce_sysfs) | |
+ { | |
+@@ -234,6 +265,51 @@ static struct shrinker huge_zero_page_shrinker = { | |
+ .seeks = DEFAULT_SEEKS, | |
+ }; | |
+ | |
++static enum lru_status low_util_free_page(struct list_head *item, | |
++ struct list_lru_one *lru, | |
++ spinlock_t *lock, | |
++ void *cb_arg) | |
++{ | |
++ int bucket, num_utilized_pages; | |
++ struct page *head = compound_head(list_entry(item, | |
++ struct page, | |
++ underutilized_thp_list)); | |
++ | |
++ if (get_page_unless_zero(head)) { | |
++ lock_page(head); | |
++ list_lru_isolate(lru, item); | |
++ num_utilized_pages = thp_number_utilized_pages(head); | |
++ bucket = thp_utilization_bucket(num_utilized_pages); | |
++ if (bucket < THP_UTIL_BUCKET_NR - 1) | |
++ split_huge_page(head); | |
++ unlock_page(head); | |
++ put_page(head); | |
++ } | |
++ | |
++ return LRU_REMOVED_RETRY; | |
++} | |
++ | |
++static unsigned long shrink_huge_low_util_page_count(struct shrinker *shrink, | |
++ struct shrink_control *sc) | |
++{ | |
++ return HPAGE_PMD_NR * list_lru_shrink_count(&huge_low_util_page_lru, sc); | |
++} | |
++ | |
++static unsigned long shrink_huge_low_util_page_scan(struct shrinker *shrink, | |
++ struct shrink_control *sc) | |
++{ | |
++ return HPAGE_PMD_NR * list_lru_shrink_walk(&huge_low_util_page_lru, | |
++ sc, low_util_free_page, NULL); | |
++} | |
++ | |
++static struct shrinker huge_low_util_page_shrinker = { | |
++ .count_objects = shrink_huge_low_util_page_count, | |
++ .scan_objects = shrink_huge_low_util_page_scan, | |
++ .seeks = DEFAULT_SEEKS, | |
++ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | | |
++ SHRINKER_NONSLAB, | |
++}; | |
++ | |
+ #ifdef CONFIG_SYSFS | |
+ static ssize_t enabled_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+@@ -485,13 +561,19 @@ static int __init hugepage_init(void) | |
+ if (err) | |
+ goto err_slab; | |
+ | |
++ schedule_delayed_work(&thp_utilization_work, HZ); | |
++ err = register_shrinker(&huge_low_util_page_shrinker, "thp-low-util"); | |
++ if (err) | |
++ goto err_low_util_shrinker; | |
+ err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); | |
+ if (err) | |
+ goto err_hzp_shrinker; | |
+ err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split"); | |
+ if (err) | |
+ goto err_split_shrinker; | |
+- | |
++ err = list_lru_init_memcg(&huge_low_util_page_lru, &huge_low_util_page_shrinker); | |
++ if (err) | |
++ goto err_low_util_list_lru; | |
+ /* | |
+ * By default disable transparent hugepages on smaller systems, | |
+ * where the extra memory used could hurt more than TLB overhead | |
+@@ -507,11 +589,16 @@ static int __init hugepage_init(void) | |
+ goto err_khugepaged; | |
+ | |
+ return 0; | |
++ | |
+ err_khugepaged: | |
++ list_lru_destroy(&huge_low_util_page_lru); | |
++err_low_util_list_lru: | |
+ unregister_shrinker(&deferred_split_shrinker); | |
+ err_split_shrinker: | |
+ unregister_shrinker(&huge_zero_page_shrinker); | |
+ err_hzp_shrinker: | |
++ unregister_shrinker(&huge_low_util_page_shrinker); | |
++err_low_util_shrinker: | |
+ khugepaged_destroy(); | |
+ err_slab: | |
+ hugepage_exit_sysfs(hugepage_kobj); | |
+@@ -586,6 +673,7 @@ void prep_transhuge_page(struct page *page) | |
+ */ | |
+ | |
+ INIT_LIST_HEAD(page_deferred_list(page)); | |
++ INIT_LIST_HEAD(page_underutilized_thp_list(page)); | |
+ set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); | |
+ } | |
+ | |
+@@ -599,6 +687,11 @@ static inline bool is_transparent_hugepage(struct page *page) | |
+ page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; | |
+ } | |
+ | |
++static inline bool is_anon_transparent_hugepage(struct page *page) | |
++{ | |
++ return PageAnon(page) && is_transparent_hugepage(page); | |
++} | |
++ | |
+ static unsigned long __thp_get_unmapped_area(struct file *filp, | |
+ unsigned long addr, unsigned long len, | |
+ loff_t off, unsigned long flags, unsigned long size) | |
+@@ -649,6 +742,49 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, | |
+ } | |
+ EXPORT_SYMBOL_GPL(thp_get_unmapped_area); | |
+ | |
++int thp_number_utilized_pages(struct page *page) | |
++{ | |
++ struct folio *folio; | |
++ unsigned long page_offset, value; | |
++ int thp_nr_utilized_pages = HPAGE_PMD_NR; | |
++ int step_size = sizeof(unsigned long); | |
++ bool is_all_zeroes; | |
++ void *kaddr; | |
++ int i; | |
++ | |
++ if (!page || !is_anon_transparent_hugepage(page)) | |
++ return -1; | |
++ | |
++ folio = page_folio(page); | |
++ for (i = 0; i < folio_nr_pages(folio); i++) { | |
++ kaddr = kmap_local_folio(folio, i); | |
++ is_all_zeroes = true; | |
++ for (page_offset = 0; page_offset < PAGE_SIZE; page_offset += step_size) { | |
++ value = *(unsigned long *)(kaddr + page_offset); | |
++ if (value != 0) { | |
++ is_all_zeroes = false; | |
++ break; | |
++ } | |
++ } | |
++ if (is_all_zeroes) | |
++ thp_nr_utilized_pages--; | |
++ | |
++ kunmap_local(kaddr); | |
++ } | |
++ return thp_nr_utilized_pages; | |
++} | |
++ | |
++int thp_utilization_bucket(int num_utilized_pages) | |
++{ | |
++ int bucket; | |
++ | |
++ if (num_utilized_pages < 0 || num_utilized_pages > HPAGE_PMD_NR) | |
++ return -1; | |
++ /* Group THPs into utilization buckets */ | |
++ bucket = num_utilized_pages * THP_UTIL_BUCKET_NR / HPAGE_PMD_NR; | |
++ return min(bucket, THP_UTIL_BUCKET_NR - 1); | |
++} | |
++ | |
+ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, | |
+ struct page *page, gfp_t gfp) | |
+ { | |
+@@ -2349,7 +2485,7 @@ static void unmap_page(struct page *page) | |
+ try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); | |
+ } | |
+ | |
+-static void remap_page(struct folio *folio, unsigned long nr) | |
++static void remap_page(struct folio *folio, unsigned long nr, bool unmap_clean) | |
+ { | |
+ int i = 0; | |
+ | |
+@@ -2357,7 +2493,7 @@ static void remap_page(struct folio *folio, unsigned long nr) | |
+ if (!folio_test_anon(folio)) | |
+ return; | |
+ for (;;) { | |
+- remove_migration_ptes(folio, folio, true); | |
++ remove_migration_ptes(folio, folio, true, unmap_clean); | |
+ i += folio_nr_pages(folio); | |
+ if (i >= nr) | |
+ break; | |
+@@ -2427,8 +2563,7 @@ static void __split_huge_page_tail(struct page *head, int tail, | |
+ LRU_GEN_MASK | LRU_REFS_MASK)); | |
+ | |
+ /* ->mapping in first tail page is compound_mapcount */ | |
+- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, | |
+- page_tail); | |
++ VM_BUG_ON_PAGE(tail > 3 && page_tail->mapping != TAIL_MAPPING, page_tail); | |
+ page_tail->mapping = head->mapping; | |
+ page_tail->index = head->index + tail; | |
+ page_tail->private = 0; | |
+@@ -2472,6 +2607,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |
+ struct address_space *swap_cache = NULL; | |
+ unsigned long offset = 0; | |
+ unsigned int nr = thp_nr_pages(head); | |
++ LIST_HEAD(pages_to_free); | |
++ int nr_pages_to_free = 0; | |
+ int i; | |
+ | |
+ /* complete memcg works before add pages to LRU */ | |
+@@ -2534,7 +2671,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |
+ } | |
+ local_irq_enable(); | |
+ | |
+- remap_page(folio, nr); | |
++ remap_page(folio, nr, PageAnon(head)); | |
+ | |
+ if (PageSwapCache(head)) { | |
+ swp_entry_t entry = { .val = page_private(head) }; | |
+@@ -2548,6 +2685,33 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |
+ continue; | |
+ unlock_page(subpage); | |
+ | |
++ /* | |
++ * If a tail page has only two references left, one inherited | |
++ * from the isolation of its head and the other from | |
++ * lru_add_page_tail() which we are about to drop, it means this | |
++ * tail page was concurrently zapped. Then we can safely free it | |
++ * and save page reclaim or migration the trouble of trying it. | |
++ */ | |
++ if (list && page_ref_freeze(subpage, 2)) { | |
++ VM_BUG_ON_PAGE(PageLRU(subpage), subpage); | |
++ VM_BUG_ON_PAGE(PageCompound(subpage), subpage); | |
++ VM_BUG_ON_PAGE(page_mapped(subpage), subpage); | |
++ | |
++ ClearPageActive(subpage); | |
++ ClearPageUnevictable(subpage); | |
++ list_move(&subpage->lru, &pages_to_free); | |
++ nr_pages_to_free++; | |
++ continue; | |
++ } | |
++ /* | |
++ * If a tail page has only one reference left, it will be freed | |
++ * by the call to free_page_and_swap_cache below. Since zero | |
++ * subpages are no longer remapped, there will only be one | |
++ * reference left in cases outside of reclaim or migration. | |
++ */ | |
++ if (page_ref_count(subpage) == 1) | |
++ nr_pages_to_free++; | |
++ | |
+ /* | |
+ * Subpages may be freed if there wasn't any mapping | |
+ * like if add_to_swap() is running on a lru page that | |
+@@ -2557,6 +2721,13 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |
+ */ | |
+ free_page_and_swap_cache(subpage); | |
+ } | |
++ | |
++ if (!nr_pages_to_free) | |
++ return; | |
++ | |
++ mem_cgroup_uncharge_list(&pages_to_free); | |
++ free_unref_page_list(&pages_to_free); | |
++ count_vm_events(THP_SPLIT_FREE, nr_pages_to_free); | |
+ } | |
+ | |
+ /* Racy check whether the huge page can be split */ | |
+@@ -2599,6 +2770,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |
+ struct folio *folio = page_folio(page); | |
+ struct page *head = &folio->page; | |
+ struct deferred_split *ds_queue = get_deferred_split_queue(head); | |
++ struct list_head *underutilized_thp_list = page_underutilized_thp_list(head); | |
+ XA_STATE(xas, &head->mapping->i_pages, head->index); | |
+ struct anon_vma *anon_vma = NULL; | |
+ struct address_space *mapping = NULL; | |
+@@ -2697,6 +2869,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |
+ list_del(page_deferred_list(head)); | |
+ } | |
+ spin_unlock(&ds_queue->split_queue_lock); | |
++ if (!list_empty(underutilized_thp_list)) | |
++ list_lru_del_page(&huge_low_util_page_lru, head, underutilized_thp_list); | |
+ if (mapping) { | |
+ int nr = thp_nr_pages(head); | |
+ | |
+@@ -2719,7 +2893,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |
+ if (mapping) | |
+ xas_unlock(&xas); | |
+ local_irq_enable(); | |
+- remap_page(folio, folio_nr_pages(folio)); | |
++ remap_page(folio, folio_nr_pages(folio), false); | |
+ ret = -EBUSY; | |
+ } | |
+ | |
+@@ -2739,6 +2913,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |
+ void free_transhuge_page(struct page *page) | |
+ { | |
+ struct deferred_split *ds_queue = get_deferred_split_queue(page); | |
++ struct list_head *underutilized_thp_list = page_underutilized_thp_list(page); | |
+ unsigned long flags; | |
+ | |
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags); | |
+@@ -2747,6 +2922,12 @@ void free_transhuge_page(struct page *page) | |
+ list_del(page_deferred_list(page)); | |
+ } | |
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); | |
++ if (!list_empty(underutilized_thp_list)) | |
++ list_lru_del_page(&huge_low_util_page_lru, page, underutilized_thp_list); | |
++ | |
++ if (PageLRU(page)) | |
++ __clear_page_lru_flags(page); | |
++ | |
+ free_compound_page(page); | |
+ } | |
+ | |
+@@ -2787,6 +2968,26 @@ void deferred_split_huge_page(struct page *page) | |
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); | |
+ } | |
+ | |
++void add_underutilized_thp(struct page *page) | |
++{ | |
++ VM_BUG_ON_PAGE(!PageTransHuge(page), page); | |
++ | |
++ if (PageSwapCache(page)) | |
++ return; | |
++ | |
++ /* | |
++ * Need to take a reference on the page to prevent the page from being freed from | 
++ * under us while we are adding the THP to the shrinker. | |
++ */ | |
++ if (!get_page_unless_zero(page)) | |
++ return; | |
++ | |
++ if (!is_huge_zero_page(page) && is_anon_transparent_hugepage(page)) | |
++ list_lru_add_page(&huge_low_util_page_lru, page, page_underutilized_thp_list(page)); | |
++ | |
++ put_page(page); | |
++} | |
++ | |
+ static unsigned long deferred_split_count(struct shrinker *shrink, | |
+ struct shrink_control *sc) | |
+ { | |
+@@ -3141,6 +3342,42 @@ static int __init split_huge_pages_debugfs(void) | |
+ return 0; | |
+ } | |
+ late_initcall(split_huge_pages_debugfs); | |
++ | |
++static int thp_utilization_show(struct seq_file *seqf, void *pos) | |
++{ | |
++ int i; | |
++ int start; | |
++ int end; | |
++ | |
++ for (i = 0; i < THP_UTIL_BUCKET_NR; i++) { | |
++ start = i * HPAGE_PMD_NR / THP_UTIL_BUCKET_NR; | |
++ end = (i + 1 == THP_UTIL_BUCKET_NR) | |
++ ? HPAGE_PMD_NR | |
++ : ((i + 1) * HPAGE_PMD_NR / THP_UTIL_BUCKET_NR - 1); | |
++ /* The last bucket will need to contain 100% utilized THPs */ | 
++ seq_printf(seqf, "Utilized[%d-%d]: %d %d\n", start, end, | |
++ thp_scan_debugfs.buckets[i].nr_thps, | |
++ thp_scan_debugfs.buckets[i].nr_zero_pages); | |
++ } | |
++ seq_printf(seqf, "Last Scan Time: %lu.%02lus\n", | |
++ (unsigned long)thp_scan_debugfs.last_scan_time.tv_sec, | |
++ (thp_scan_debugfs.last_scan_time.tv_nsec / (NSEC_PER_SEC / 100))); | |
++ | |
++ seq_printf(seqf, "Last Scan Duration: %lu.%02lus\n", | |
++ (unsigned long)thp_scan_debugfs.last_scan_duration.tv_sec, | |
++ (thp_scan_debugfs.last_scan_duration.tv_nsec / (NSEC_PER_SEC / 100))); | |
++ | |
++ return 0; | |
++} | |
++DEFINE_SHOW_ATTRIBUTE(thp_utilization); | |
++ | |
++static int __init thp_utilization_debugfs(void) | |
++{ | |
++ debugfs_create_file("thp_utilization", 0200, NULL, NULL, | |
++ &thp_utilization_fops); | |
++ return 0; | |
++} | |
++late_initcall(thp_utilization_debugfs); | |
+ #endif | |
+ | |
+ #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION | |
+@@ -3226,3 +3463,94 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) | |
+ trace_remove_migration_pmd(address, pmd_val(pmde)); | |
+ } | |
+ #endif | |
++ | |
++static void thp_scan_next_zone(void) | |
++{ | |
++ struct timespec64 current_time; | |
++ int i; | |
++ bool update_debugfs; | |
++ /* | |
++ * THP utilization worker thread has reached the end | |
++ * of the memory zone. Proceed to the next zone. | |
++ */ | |
++ thp_scan.scan_zone = next_zone(thp_scan.scan_zone); | |
++ update_debugfs = !thp_scan.scan_zone; | |
++ thp_scan.scan_zone = update_debugfs ? (first_online_pgdat())->node_zones | |
++ : thp_scan.scan_zone; | |
++ thp_scan.pfn = (thp_scan.scan_zone->zone_start_pfn + HPAGE_PMD_NR - 1) | |
++ & ~(HPAGE_PMD_SIZE - 1); | |
++ if (!update_debugfs) | |
++ return; | |
++ /* | |
++ * If the worker has scanned through all of physical memory, update | 
++ * the information displayed in /sys/kernel/debug/thp_utilization. | 
++ */ | |
++ ktime_get_ts64(¤t_time); | |
++ thp_scan_debugfs.last_scan_duration = timespec64_sub(current_time, | |
++ thp_scan_debugfs.last_scan_time); | |
++ thp_scan_debugfs.last_scan_time = current_time; | |
++ | |
++ for (i = 0; i < THP_UTIL_BUCKET_NR; i++) { | |
++ thp_scan_debugfs.buckets[i].nr_thps = thp_scan.buckets[i].nr_thps; | |
++ thp_scan_debugfs.buckets[i].nr_zero_pages = thp_scan.buckets[i].nr_zero_pages; | |
++ thp_scan.buckets[i].nr_thps = 0; | |
++ thp_scan.buckets[i].nr_zero_pages = 0; | |
++ } | |
++} | |
++ | |
++static void thp_util_scan(unsigned long pfn_end) | |
++{ | |
++ struct page *page = NULL; | |
++ int bucket, num_utilized_pages, current_pfn; | |
++ int i; | |
++ /* | |
++ * Scan through each memory zone in chunks of THP_UTIL_SCAN_SIZE | 
++ * hugepages every second, looking for anonymous THPs. | 
++ */ | |
++ for (i = 0; i < THP_UTIL_SCAN_SIZE; i++) { | |
++ current_pfn = thp_scan.pfn; | |
++ thp_scan.pfn += HPAGE_PMD_NR; | |
++ if (current_pfn >= pfn_end) | |
++ return; | |
++ | |
++ if (!pfn_valid(current_pfn)) | |
++ continue; | |
++ | |
++ page = pfn_to_page(current_pfn); | |
++ num_utilized_pages = thp_number_utilized_pages(page); | |
++ bucket = thp_utilization_bucket(num_utilized_pages); | |
++ if (bucket < 0) | |
++ continue; | |
++ | |
++ if (bucket < THP_UTIL_BUCKET_NR - 1) | |
++ add_underutilized_thp(page); | |
++ | |
++ thp_scan.buckets[bucket].nr_thps++; | |
++ thp_scan.buckets[bucket].nr_zero_pages += (HPAGE_PMD_NR - num_utilized_pages); | |
++ } | |
++} | |
++ | |
++static void thp_utilization_workfn(struct work_struct *work) | |
++{ | |
++ unsigned long pfn_end; | |
++ | |
++ if (!thp_scan.scan_zone) | |
++ thp_scan.scan_zone = (first_online_pgdat())->node_zones; | |
++ /* | |
++ * Worker function that scans through all of physical memory | |
++ * for anonymous THPs. | |
++ */ | |
++ pfn_end = (thp_scan.scan_zone->zone_start_pfn + | |
++ thp_scan.scan_zone->spanned_pages + HPAGE_PMD_NR - 1) | |
++ & ~(HPAGE_PMD_SIZE - 1); | |
++ /* If we have reached the end of the zone or the end of physical | 
++ * memory, move on to the next zone. Otherwise, scan the next PFNs in the | 
++ * current zone. | |
++ */ | |
++ if (!populated_zone(thp_scan.scan_zone) || thp_scan.pfn >= pfn_end) | |
++ thp_scan_next_zone(); | |
++ else | |
++ thp_util_scan(pfn_end); | |
++ | |
++ schedule_delayed_work(&thp_utilization_work, HZ); | |
++} | |
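A quick sanity check on the scan cadence implemented by thp_utilization_workfn() above, assuming 2MB THPs: each one-second tick covers THP_UTIL_SCAN_SIZE = 256 hugepage-sized strides, i.e. 256 * 2MB = 512MB of physical address space, so a machine with 128GB of RAM is swept completely in roughly 128GB / 512MB = 256 seconds (a little over four minutes), plus one idle tick per zone transition.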
+diff --git a/mm/list_lru.c b/mm/list_lru.c | |
+index a05e5bef3b40..7e8b324cc840 100644 | |
+--- a/mm/list_lru.c | |
++++ b/mm/list_lru.c | |
+@@ -140,6 +140,32 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) | |
+ } | |
+ EXPORT_SYMBOL_GPL(list_lru_add); | |
+ | |
++bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item) | |
++{ | |
++ int nid = page_to_nid(page); | |
++ struct list_lru_node *nlru = &lru->node[nid]; | |
++ struct list_lru_one *l; | |
++ struct mem_cgroup *memcg; | |
++ | |
++ spin_lock(&nlru->lock); | |
++ if (list_empty(item)) { | |
++ memcg = page_memcg(page); | |
++ memcg_list_lru_alloc(memcg, lru, GFP_KERNEL); | |
++ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); | |
++ list_add_tail(item, &l->list); | |
++ /* Set shrinker bit if the first element was added */ | |
++ if (!l->nr_items++) | |
++ set_shrinker_bit(memcg, nid, | |
++ lru_shrinker_id(lru)); | |
++ nlru->nr_items++; | |
++ spin_unlock(&nlru->lock); | |
++ return true; | |
++ } | |
++ spin_unlock(&nlru->lock); | |
++ return false; | |
++} | |
++EXPORT_SYMBOL_GPL(list_lru_add_page); | |
++ | |
+ bool list_lru_del(struct list_lru *lru, struct list_head *item) | |
+ { | |
+ int nid = page_to_nid(virt_to_page(item)); | |
+@@ -160,6 +186,29 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) | |
+ } | |
+ EXPORT_SYMBOL_GPL(list_lru_del); | |
+ | |
++bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item) | |
++{ | |
++ int nid = page_to_nid(page); | |
++ struct list_lru_node *nlru = &lru->node[nid]; | |
++ struct list_lru_one *l; | |
++ struct mem_cgroup *memcg; | |
++ | |
++ spin_lock(&nlru->lock); | |
++ if (!list_empty(item)) { | |
++ memcg = page_memcg(page); | |
++ memcg_list_lru_alloc(memcg, lru, GFP_KERNEL); | |
++ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); | |
++ list_del_init(item); | |
++ l->nr_items--; | |
++ nlru->nr_items--; | |
++ spin_unlock(&nlru->lock); | |
++ return true; | |
++ } | |
++ spin_unlock(&nlru->lock); | |
++ return false; | |
++} | |
++EXPORT_SYMBOL_GPL(list_lru_del_page); | |
++ | |
+ void list_lru_isolate(struct list_lru_one *list, struct list_head *item) | |
+ { | |
+ list_del_init(item); | |
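To make the intended pairing of these page-aware helpers concrete, here is a minimal sketch of how a caller such as the THP shrinker earlier in this patch drives them; the wrapper names track_underutilized()/untrack_underutilized() are hypothetical, everything else mirrors the huge_memory.c hunks above:

    /* Queue a THP head page on the low-utilization LRU. The helper itself
     * checks list_empty(), so a page that is already queued is left alone.
     */
    static void track_underutilized(struct page *head)
    {
            list_lru_add_page(&huge_low_util_page_lru, head,
                              page_underutilized_thp_list(head));
    }

    /* Remove it again before the THP is split or freed; the list_empty()
     * check merely avoids taking the node lock for pages that were never queued.
     */
    static void untrack_underutilized(struct page *head)
    {
            if (!list_empty(page_underutilized_thp_list(head)))
                    list_lru_del_page(&huge_low_util_page_lru, head,
                                      page_underutilized_thp_list(head));
    }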
+diff --git a/mm/migrate.c b/mm/migrate.c | |
+index 55e7718cfe45..57908d680276 100644 | |
+--- a/mm/migrate.c | |
++++ b/mm/migrate.c | |
+@@ -168,13 +168,62 @@ void putback_movable_pages(struct list_head *l) | |
+ } | |
+ } | |
+ | |
++static bool try_to_unmap_clean(struct page_vma_mapped_walk *pvmw, struct page *page) | |
++{ | |
++ void *addr; | |
++ bool dirty; | |
++ pte_t newpte; | |
++ | |
++ VM_BUG_ON_PAGE(PageCompound(page), page); | |
++ VM_BUG_ON_PAGE(!PageAnon(page), page); | |
++ VM_BUG_ON_PAGE(!PageLocked(page), page); | |
++ VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); | |
++ | |
++ if (PageMlocked(page) || (pvmw->vma->vm_flags & VM_LOCKED)) | |
++ return false; | |
++ | |
++ /* | |
++ * The pmd entry mapping the old thp was flushed and the pte mapping | |
++ * this subpage has been non present. Therefore, this subpage is | |
++ * inaccessible. We don't need to remap it if it contains only zeros. | |
++ */ | |
++ addr = kmap_local_page(page); | |
++ dirty = memchr_inv(addr, 0, PAGE_SIZE); | |
++ kunmap_local(addr); | |
++ | |
++ if (dirty) | |
++ return false; | |
++ | |
++ pte_clear_not_present_full(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, false); | |
++ | |
++ if (userfaultfd_armed(pvmw->vma)) { | |
++ newpte = pte_mkspecial(pfn_pte(page_to_pfn(ZERO_PAGE(pvmw->address)), | |
++ pvmw->vma->vm_page_prot)); | |
++ ptep_clear_flush(pvmw->vma, pvmw->address, pvmw->pte); | |
++ set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte); | |
++ dec_mm_counter(pvmw->vma->vm_mm, MM_ANONPAGES); | |
++ count_vm_event(THP_SPLIT_REMAP_READONLY_ZERO_PAGE); | |
++ return true; | |
++ } | |
++ | |
++ dec_mm_counter(pvmw->vma->vm_mm, mm_counter(page)); | |
++ count_vm_event(THP_SPLIT_UNMAP); | |
++ return true; | |
++} | |
++ | |
++struct rmap_walk_arg { | |
++ struct folio *folio; | |
++ bool unmap_clean; | |
++}; | |
++ | |
+ /* | |
+ * Restore a potential migration pte to a working pte entry | |
+ */ | |
+ static bool remove_migration_pte(struct folio *folio, | |
+- struct vm_area_struct *vma, unsigned long addr, void *old) | |
++ struct vm_area_struct *vma, unsigned long addr, void *arg) | |
+ { | |
+- DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); | |
++ struct rmap_walk_arg *rmap_walk_arg = arg; | |
++ DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION); | |
+ | |
+ while (page_vma_mapped_walk(&pvmw)) { | |
+ rmap_t rmap_flags = RMAP_NONE; | |
+@@ -197,6 +246,8 @@ static bool remove_migration_pte(struct folio *folio, | |
+ continue; | |
+ } | |
+ #endif | |
++ if (rmap_walk_arg->unmap_clean && try_to_unmap_clean(&pvmw, new)) | |
++ continue; | |
+ | |
+ folio_get(folio); | |
+ pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); | |
+@@ -268,13 +319,20 @@ static bool remove_migration_pte(struct folio *folio, | |
+ * Get rid of all migration entries and replace them by | |
+ * references to the indicated page. | |
+ */ | |
+-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) | |
++void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean) | |
+ { | |
++ struct rmap_walk_arg rmap_walk_arg = { | |
++ .folio = src, | |
++ .unmap_clean = unmap_clean, | |
++ }; | |
++ | |
+ struct rmap_walk_control rwc = { | |
+ .rmap_one = remove_migration_pte, | |
+- .arg = src, | |
++ .arg = &rmap_walk_arg, | |
+ }; | |
+ | |
++ VM_BUG_ON_FOLIO(unmap_clean && src != dst, src); | |
++ | |
+ if (locked) | |
+ rmap_walk_locked(dst, &rwc); | |
+ else | |
+@@ -850,7 +908,7 @@ static int writeout(struct address_space *mapping, struct folio *folio) | |
+ * At this point we know that the migration attempt cannot | |
+ * be successful. | |
+ */ | |
+- remove_migration_ptes(folio, folio, false); | |
++ remove_migration_ptes(folio, folio, false, false); | |
+ | |
+ rc = mapping->a_ops->writepage(&folio->page, &wbc); | |
+ | |
+@@ -1109,7 +1167,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |
+ | |
+ if (page_was_mapped) | |
+ remove_migration_ptes(folio, | |
+- rc == MIGRATEPAGE_SUCCESS ? dst : folio, false); | |
++ rc == MIGRATEPAGE_SUCCESS ? dst : folio, false, false); | |
+ | |
+ out_unlock_both: | |
+ unlock_page(newpage); | |
+@@ -1319,7 +1377,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |
+ | |
+ if (page_was_mapped) | |
+ remove_migration_ptes(src, | |
+- rc == MIGRATEPAGE_SUCCESS ? dst : src, false); | |
++ rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false); | |
+ | |
+ unlock_put_anon: | |
+ unlock_page(new_hpage); | |
+diff --git a/mm/migrate_device.c b/mm/migrate_device.c | |
+index dbf6c7a7a7c9..518aacc914c9 100644 | |
+--- a/mm/migrate_device.c | |
++++ b/mm/migrate_device.c | |
+@@ -413,7 +413,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) | |
+ continue; | |
+ | |
+ folio = page_folio(page); | |
+- remove_migration_ptes(folio, folio, false); | |
++ remove_migration_ptes(folio, folio, false, false); | |
+ | |
+ migrate->src[i] = 0; | |
+ folio_unlock(folio); | |
+@@ -789,7 +789,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate) | |
+ | |
+ src = page_folio(page); | |
+ dst = page_folio(newpage); | |
+- remove_migration_ptes(src, dst, false); | |
++ remove_migration_ptes(src, dst, false, false); | |
+ folio_unlock(src); | |
+ | |
+ if (is_zone_device_page(page)) | |
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c | |
+index 65ffd285db54..8536bb6f655b 100644 | |
+--- a/mm/page_alloc.c | |
++++ b/mm/page_alloc.c | |
+@@ -1328,6 +1328,12 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) | |
+ * deferred_list.next -- ignore value. | |
+ */ | |
+ break; | |
++ case 3: | |
++ /* | |
++ * the third tail page: ->mapping is | |
++ * underutilized_thp_list.next -- ignore value. | |
++ */ | |
++ break; | |
+ default: | |
+ if (page->mapping != TAIL_MAPPING) { | |
+ bad_page(page, "corrupted mapping in tail page"); | |
+diff --git a/mm/vmstat.c b/mm/vmstat.c | |
+index 33091a67627e..f6c5d0e97499 100644 | |
+--- a/mm/vmstat.c | |
++++ b/mm/vmstat.c | |
+@@ -1369,6 +1369,9 @@ const char * const vmstat_text[] = { | |
+ #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD | |
+ "thp_split_pud", | |
+ #endif | |
++ "thp_split_free", | |
++ "thp_split_unmap", | |
++ "thp_split_remap_readonly_zero_page", | |
+ "thp_zero_page_alloc", | |
+ "thp_zero_page_alloc_failed", | |
+ "thp_swpout", | |
+diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c | |
+index 6aa2b8253aed..2c669aadbfd0 100644 | |
+--- a/tools/testing/selftests/vm/split_huge_page_test.c | |
++++ b/tools/testing/selftests/vm/split_huge_page_test.c | |
+@@ -16,6 +16,9 @@ | |
+ #include <sys/mount.h> | |
+ #include <malloc.h> | |
+ #include <stdbool.h> | |
++#include <sys/syscall.h> /* Definition of SYS_* constants */ | |
++#include <linux/userfaultfd.h> | |
++#include <sys/ioctl.h> | |
+ #include "vm_util.h" | |
+ | |
+ uint64_t pagesize; | |
+@@ -88,6 +91,113 @@ static void write_debugfs(const char *fmt, ...) | |
+ } | |
+ } | |
+ | |
++static char *allocate_zero_filled_hugepage(size_t len) | |
++{ | |
++ char *result; | |
++ size_t i; | |
++ | |
++ result = memalign(pmd_pagesize, len); | |
++ if (!result) { | |
++ printf("Fail to allocate memory\n"); | |
++ printf("Failed to allocate memory\n"); | 
++ } | |
++ madvise(result, len, MADV_HUGEPAGE); | |
++ | |
++ for (i = 0; i < len; i++) | |
++ result[i] = (char)0; | |
++ | |
++ return result; | |
++} | |
++ | |
++static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, size_t len) | |
++{ | |
++ uint64_t thp_size, rss_anon_before, rss_anon_after; | |
++ size_t i; | |
++ | |
++ thp_size = check_huge(one_page); | |
++ if (!thp_size) { | |
++ printf("No THP is allocated\n"); | |
++ exit(EXIT_FAILURE); | |
++ } | |
++ | |
++ rss_anon_before = rss_anon(); | |
++ if (!rss_anon_before) { | |
++ printf("No RssAnon is allocated before split\n"); | |
++ exit(EXIT_FAILURE); | |
++ } | |
++ /* split all THPs */ | |
++ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, | |
++ (uint64_t)one_page + len); | |
++ | |
++ for (i = 0; i < len; i++) | |
++ if (one_page[i] != (char)0) { | |
++ printf("%ld byte corrupted\n", i); | |
++ exit(EXIT_FAILURE); | |
++ } | |
++ | |
++ thp_size = check_huge(one_page); | |
++ if (thp_size) { | |
++ printf("Still %ld kB AnonHugePages not split\n", thp_size); | |
++ exit(EXIT_FAILURE); | |
++ } | |
++ | |
++ rss_anon_after = rss_anon(); | |
++ if (rss_anon_after >= rss_anon_before) { | |
++ printf("Incorrect RssAnon value. Before: %ld After: %ld\n", | |
++ rss_anon_before, rss_anon_after); | |
++ exit(EXIT_FAILURE); | |
++ } | |
++} | |
++ | |
++void split_pmd_zero_pages(void) | |
++{ | |
++ char *one_page; | |
++ size_t len = 4 * pmd_pagesize; | |
++ | |
++ one_page = allocate_zero_filled_hugepage(len); | |
++ verify_rss_anon_split_huge_page_all_zeroes(one_page, len); | |
++ printf("Split zero filled huge pages successful\n"); | |
++ free(one_page); | |
++} | |
++ | |
++void split_pmd_zero_pages_uffd(void) | |
++{ | |
++ char *one_page; | |
++ size_t len = 4 * pmd_pagesize; | |
++ long uffd; /* userfaultfd file descriptor */ | |
++ struct uffdio_api uffdio_api; | |
++ struct uffdio_register uffdio_register; | |
++ | |
++ /* Create and enable userfaultfd object. */ | |
++ | |
++ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); | |
++ if (uffd == -1) { | |
++ perror("userfaultfd"); | |
++ exit(1); | |
++ } | |
++ | |
++ uffdio_api.api = UFFD_API; | |
++ uffdio_api.features = 0; | |
++ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { | |
++ perror("ioctl-UFFDIO_API"); | |
++ exit(1); | |
++ } | |
++ | |
++ one_page = allocate_zero_filled_hugepage(len); | |
++ | |
++ uffdio_register.range.start = (unsigned long)one_page; | |
++ uffdio_register.range.len = len; | |
++ uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; | |
++ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { | |
++ perror("ioctl-UFFDIO_REGISTER"); | |
++ exit(1); | |
++ } | |
++ | |
++ verify_rss_anon_split_huge_page_all_zeroes(one_page, len); | |
++ printf("Split zero filled huge pages with uffd successful\n"); | |
++ free(one_page); | |
++} | |
++ | |
+ void split_pmd_thp(void) | |
+ { | |
+ char *one_page; | |
+@@ -123,7 +233,6 @@ void split_pmd_thp(void) | |
+ exit(EXIT_FAILURE); | |
+ } | |
+ | |
+- | |
+ thp_size = check_huge(one_page); | |
+ if (thp_size) { | |
+ printf("Still %ld kB AnonHugePages not split\n", thp_size); | |
+@@ -305,6 +414,8 @@ int main(int argc, char **argv) | |
+ pageshift = ffs(pagesize) - 1; | |
+ pmd_pagesize = read_pmd_pagesize(); | |
+ | |
++ split_pmd_zero_pages(); | |
++ split_pmd_zero_pages_uffd(); | |
+ split_pmd_thp(); | |
+ split_pte_mapped_thp(); | |
+ split_file_backed_thp(); | |
+diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c | |
+index b58ab11a7a30..c6a785a67fc9 100644 | |
+--- a/tools/testing/selftests/vm/vm_util.c | |
++++ b/tools/testing/selftests/vm/vm_util.c | |
+@@ -6,6 +6,7 @@ | |
+ | |
+ #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" | |
+ #define SMAP_FILE_PATH "/proc/self/smaps" | |
++#define STATUS_FILE_PATH "/proc/self/status" | |
+ #define MAX_LINE_LENGTH 500 | |
+ | |
+ uint64_t pagemap_get_entry(int fd, char *start) | |
+@@ -72,6 +73,28 @@ uint64_t read_pmd_pagesize(void) | |
+ return strtoul(buf, NULL, 10); | |
+ } | |
+ | |
++uint64_t rss_anon(void) | |
++{ | |
++ uint64_t rss_anon = 0; | |
++ int ret; | |
++ FILE *fp; | |
++ char buffer[MAX_LINE_LENGTH]; | |
++ | |
++ fp = fopen(STATUS_FILE_PATH, "r"); | |
++ if (!fp) | |
++ ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH); | |
++ | |
++ if (!check_for_pattern(fp, "RssAnon:", buffer)) | |
++ goto err_out; | |
++ | |
++ if (sscanf(buffer, "RssAnon:%10ld kB", &rss_anon) != 1) | |
++ ksft_exit_fail_msg("Reading status error\n"); | |
++ | |
++err_out: | |
++ fclose(fp); | |
++ return rss_anon; | |
++} | |
++ | |
+ uint64_t check_huge(void *addr) | |
+ { | |
+ uint64_t thp = 0; | |
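For context on the parsing in rss_anon() above: the RssAnon field that check_for_pattern() locates in /proc/self/status is printed by the kernel as a single line of the form shown below (value in kB; the number here is hypothetical), which is what the "RssAnon:%10ld kB" conversion expects:

    RssAnon:     10240 kB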
+diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h | |
+index 2e512bd57ae1..00b92ccef20d 100644 | |
+--- a/tools/testing/selftests/vm/vm_util.h | |
++++ b/tools/testing/selftests/vm/vm_util.h | |
+@@ -6,4 +6,5 @@ uint64_t pagemap_get_entry(int fd, char *start); | |
+ bool pagemap_is_softdirty(int fd, char *start); | |
+ void clear_softdirty(void); | |
+ uint64_t read_pmd_pagesize(void); | |
++uint64_t rss_anon(void); | |
+ uint64_t check_huge(void *addr); | |
+-- | |
+2.38.0.rc2 | |
+ | |
+From 548ee3c5ecb6abba92c8a237187bac104b55850b Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Fri, 19 Aug 2022 17:06:47 +0200 | |
-Subject: [PATCH 10/16] rtw88 | |
+Subject: [PATCH 11/17] rtw88 | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -84988,86 +86297,12 @@ | |
{ | |
__le16 fc = hdr->frame_control; | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
- | |
-From 953761366f999b9035f8fff70c214426ad9f027b Mon Sep 17 00:00:00 2001 | |
-From: Peter Jung <[email protected]> | |
-Date: Wed, 14 Sep 2022 14:40:34 +0200 | |
-Subject: [PATCH 11/16] rcu | |
- | |
-Signed-off-by: Peter Jung <[email protected]> | |
---- | |
- kernel/rcu/tree_nocb.h | 34 +++++++++++----------------------- | |
- 1 file changed, 11 insertions(+), 23 deletions(-) | |
- | |
-diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h | |
-index a8f574d8850d..4017ebecec91 100644 | |
---- a/kernel/rcu/tree_nocb.h | |
-+++ b/kernel/rcu/tree_nocb.h | |
-@@ -1210,45 +1210,33 @@ EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); | |
- void __init rcu_init_nohz(void) | |
- { | |
- int cpu; | |
-- bool need_rcu_nocb_mask = false; | |
-- bool offload_all = false; | |
- struct rcu_data *rdp; | |
-- | |
--#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) | |
-- if (!rcu_state.nocb_is_setup) { | |
-- need_rcu_nocb_mask = true; | |
-- offload_all = true; | |
-- } | |
--#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */ | |
-+ const struct cpumask *cpumask = NULL; | |
- | |
- #if defined(CONFIG_NO_HZ_FULL) | |
-- if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) { | |
-- need_rcu_nocb_mask = true; | |
-- offload_all = false; /* NO_HZ_FULL has its own mask. */ | |
-- } | |
--#endif /* #if defined(CONFIG_NO_HZ_FULL) */ | |
-+ if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) | |
-+ cpumask = tick_nohz_full_mask; | |
-+#endif | |
-+ | |
-+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) && | |
-+ !rcu_state.nocb_is_setup && !cpumask) | |
-+ cpumask = cpu_possible_mask; | |
- | |
-- if (need_rcu_nocb_mask) { | |
-+ if (cpumask) { | |
- if (!cpumask_available(rcu_nocb_mask)) { | |
- if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { | |
- pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); | |
- return; | |
- } | |
- } | |
-+ | |
-+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, cpumask); | |
- rcu_state.nocb_is_setup = true; | |
- } | |
- | |
- if (!rcu_state.nocb_is_setup) | |
- return; | |
- | |
--#if defined(CONFIG_NO_HZ_FULL) | |
-- if (tick_nohz_full_running) | |
-- cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); | |
--#endif /* #if defined(CONFIG_NO_HZ_FULL) */ | |
-- | |
-- if (offload_all) | |
-- cpumask_setall(rcu_nocb_mask); | |
-- | |
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { | |
- pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); | |
- cpumask_and(rcu_nocb_mask, cpu_possible_mask, | |
--- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From e2af20ddb7f4e410c25c3deb9dd579d56e340a0b Mon Sep 17 00:00:00 2001 | |
+From 2407936bbc22b2c76fb8517aee9c24764fe02697 Mon Sep 17 00:00:00 2001 | |
From: Piotr Gorski <[email protected]> | |
Date: Tue, 6 Sep 2022 20:04:11 +0200 | |
-Subject: [PATCH 12/16] lrng | |
+Subject: [PATCH 12/17] lrng | |
Signed-off-by: Piotr Gorski <[email protected]> | |
--- | |
@@ -85196,10 +86431,10 @@ | |
create mode 100644 include/linux/lrng.h | |
diff --git a/MAINTAINERS b/MAINTAINERS | |
-index 96a09757feb3..e3c1b29c60a0 100644 | |
+index 9a5a422817af..14556e749fb6 100644 | |
--- a/MAINTAINERS | |
+++ b/MAINTAINERS | |
-@@ -11741,6 +11741,13 @@ F: Documentation/litmus-tests/ | |
+@@ -11740,6 +11740,13 @@ F: Documentation/litmus-tests/ | |
F: Documentation/memory-barriers.txt | |
F: tools/memory-model/ | |
@@ -95661,12 +96896,12 @@ | |
return; | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From e1f1e6838dfabd0b23fc9a7ee4dc0d0a91d27680 Mon Sep 17 00:00:00 2001 | |
+From 0271dda9e4999127b4f97f499a71e7a601135b0e Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Mon, 19 Sep 2022 14:40:14 +0200 | |
-Subject: [PATCH 13/16] folios | |
+Subject: [PATCH 13/17] folios | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -97675,12 +98910,12 @@ | |
* Perform any setup for the swap system | |
*/ | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From da70f4396195cb2e56bcfe68c95ea4e31c933e6b Mon Sep 17 00:00:00 2001 | |
+From 11580e94028d127bbf458c642c5b62f8e3d73328 Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Mon, 19 Sep 2022 14:42:00 +0200 | |
-Subject: [PATCH 14/16] fixes | |
+Subject: [PATCH 14/17] fixes | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -99367,12 +100602,12 @@ | |
} | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From 1c95ad8820155c71485f71b29697ed823bcce3b2 Mon Sep 17 00:00:00 2001 | |
+From 26b540787c916d1cb1759f1c106870a0ca2afc11 Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Mon, 26 Sep 2022 00:19:51 +0200 | |
-Subject: [PATCH 15/16] kallsyms | |
+Subject: [PATCH 15/17] kallsyms | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -99437,10 +100672,10 @@ | |
#endif /* _LINUX_MODULE_H */ | |
diff --git a/init/Kconfig b/init/Kconfig | |
-index 442a945ca6ae..b3a9ec8aa753 100644 | |
+index f5bd72b39352..274cabde40ab 100644 | |
--- a/init/Kconfig | |
+++ b/init/Kconfig | |
-@@ -1742,6 +1742,19 @@ config KALLSYMS | |
+@@ -1755,6 +1755,19 @@ config KALLSYMS | |
symbolic stack backtraces. This increases the size of the kernel | |
somewhat, as all symbols have to be loaded into the kernel image. | |
@@ -100508,12 +101743,12 @@ | |
} | |
} | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
-From 2fc2cb736eb578dcdd96ebc321ef6fe31971e7a3 Mon Sep 17 00:00:00 2001 | |
+From ac75e856b8158802ecf741048b59ad6a91d7d087 Mon Sep 17 00:00:00 2001 | |
From: Peter Jung <[email protected]> | |
Date: Wed, 28 Sep 2022 00:34:04 +0200 | |
-Subject: [PATCH 16/16] bitmap | |
+Subject: [PATCH 16/17] bitmap | |
Signed-off-by: Peter Jung <[email protected]> | |
--- | |
@@ -102556,5 +103791,1855 @@ | |
} | |
#endif | |
-- | |
-2.38.0.rc1.8.g2a7d63a245 | |
+2.38.0.rc2 | |
+ | |
+From 4fcdfc4036203abf0175a8ae39586cd3ff86e31f Mon Sep 17 00:00:00 2001 | |
+From: Peter Jung <[email protected]> | |
+Date: Sun, 2 Oct 2022 19:11:33 +0200 | |
+Subject: [PATCH 17/17] rcu | |
+ | |
+Signed-off-by: Peter Jung <[email protected]> | |
+--- | |
+ Documentation/RCU/checklist.rst | 15 +- | |
+ Documentation/RCU/rcu_dereference.rst | 14 +- | |
+ Documentation/RCU/whatisRCU.rst | 47 ++-- | |
+ include/linux/rcupdate.h | 42 +++- | |
+ include/linux/rcutiny.h | 50 ++++ | |
+ include/linux/rcutree.h | 40 ++++ | |
+ include/linux/srcutiny.h | 10 +- | |
+ kernel/rcu/rcutorture.c | 290 ++++++++++++++++++---- | |
+ kernel/rcu/srcutiny.c | 14 +- | |
+ kernel/rcu/tasks.h | 5 +- | |
+ kernel/rcu/tiny.c | 27 ++- | |
+ kernel/rcu/tree.c | 330 ++++++++++++++++++++------ | |
+ kernel/rcu/tree_exp.h | 57 ++++- | |
+ kernel/rcu/tree_nocb.h | 10 +- | |
+ kernel/rcu/tree_plugin.h | 26 +- | |
+ kernel/rcu/tree_stall.h | 5 +- | |
+ kernel/sched/core.c | 14 ++ | |
+ kernel/smp.c | 3 +- | |
+ 18 files changed, 813 insertions(+), 186 deletions(-) | |
+ | |
+diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst | |
+index 42cc5d891bd2..178ca7547b98 100644 | |
+--- a/Documentation/RCU/checklist.rst | |
++++ b/Documentation/RCU/checklist.rst | |
+@@ -66,8 +66,13 @@ over a rather long period of time, but improvements are always welcome! | |
+ As a rough rule of thumb, any dereference of an RCU-protected | |
+ pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(), | |
+ rcu_read_lock_sched(), or by the appropriate update-side lock. | |
+- Disabling of preemption can serve as rcu_read_lock_sched(), but | |
+- is less readable and prevents lockdep from detecting locking issues. | |
++ Explicit disabling of preemption (preempt_disable(), for example) | |
++ can serve as rcu_read_lock_sched(), but is less readable and | |
++ prevents lockdep from detecting locking issues. | |
++ | |
++ Please note that you *cannot* rely on code known to be built | 
++ only in non-preemptible kernels. Such code can and will break, | |
++ especially in kernels built with CONFIG_PREEMPT_COUNT=y. | |
+ | |
+ Letting RCU-protected pointers "leak" out of an RCU read-side | |
+ critical section is every bit as bad as letting them leak out | |
+@@ -185,6 +190,9 @@ over a rather long period of time, but improvements are always welcome! | |
+ | |
+ 5. If call_rcu() or call_srcu() is used, the callback function will | |
+ be called from softirq context. In particular, it cannot block. | |
++ If you need the callback to block, run that code in a workqueue | |
++ handler scheduled from the callback. The queue_rcu_work() | |
++ function does this for you in the case of call_rcu(). | |
+ | |
+ 6. Since synchronize_rcu() can block, it cannot be called | |
+ from any sort of irq context. The same rule applies | |
+@@ -297,7 +305,8 @@ over a rather long period of time, but improvements are always welcome! | |
+ the machine. | |
+ | |
+ d. Periodically invoke synchronize_rcu(), permitting a limited | |
+- number of updates per grace period. | |
++ number of updates per grace period. Better yet, periodically | |
++ invoke rcu_barrier() to wait for all outstanding callbacks. | |
+ | |
+ The same cautions apply to call_srcu() and kfree_rcu(). | |
+ | |
+diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst | |
+index 0b418a5b243c..81e828c8313b 100644 | |
+--- a/Documentation/RCU/rcu_dereference.rst | |
++++ b/Documentation/RCU/rcu_dereference.rst | |
+@@ -128,10 +128,16 @@ Follow these rules to keep your RCU code working properly: | |
+ This sort of comparison occurs frequently when scanning | |
+ RCU-protected circular linked lists. | |
+ | |
+- Note that if checks for being within an RCU read-side | |
+- critical section are not required and the pointer is never | |
+- dereferenced, rcu_access_pointer() should be used in place | |
+- of rcu_dereference(). | |
++ Note that if the pointer comparison is done outside | |
++ of an RCU read-side critical section, and the pointer | |
++ is never dereferenced, rcu_access_pointer() should be | |
++ used in place of rcu_dereference(). In most cases, | |
++ it is best to avoid accidental dereferences by testing | |
++ the rcu_access_pointer() return value directly, without | |
++ assigning it to a variable. | |
++ | |
++ Within an RCU read-side critical section, there is little | |
++ reason to use rcu_access_pointer(). | |
+ | |
+ - The comparison is against a pointer that references memory | |
+ that was initialized "a long time ago." The reason | |
+diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst | |
+index 77ea260efd12..1c747ac3f2c8 100644 | |
+--- a/Documentation/RCU/whatisRCU.rst | |
++++ b/Documentation/RCU/whatisRCU.rst | |
+@@ -6,13 +6,15 @@ What is RCU? -- "Read, Copy, Update" | |
+ Please note that the "What is RCU?" LWN series is an excellent place | |
+ to start learning about RCU: | |
+ | |
+-| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ | |
+-| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ | |
+-| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ | |
+-| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ | |
+-| 2010 Big API Table http://lwn.net/Articles/419086/ | |
+-| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ | |
+-| 2014 Big API Table http://lwn.net/Articles/609973/ | |
++| 1. What is RCU, Fundamentally? https://lwn.net/Articles/262464/ | |
++| 2. What is RCU? Part 2: Usage https://lwn.net/Articles/263130/ | |
++| 3. RCU part 3: the RCU API https://lwn.net/Articles/264090/ | |
++| 4. The RCU API, 2010 Edition https://lwn.net/Articles/418853/ | |
++| 2010 Big API Table https://lwn.net/Articles/419086/ | |
++| 5. The RCU API, 2014 Edition https://lwn.net/Articles/609904/ | |
++| 2014 Big API Table https://lwn.net/Articles/609973/ | |
++| 6. The RCU API, 2019 Edition https://lwn.net/Articles/777036/ | |
++| 2019 Big API Table https://lwn.net/Articles/777165/ | |
+ | |
+ | |
+ What is RCU? | |
+@@ -915,13 +917,18 @@ which an RCU reference is held include: | |
+ The understanding that RCU provides a reference that only prevents a | |
+ change of type is particularly visible with objects allocated from a | |
+ slab cache marked ``SLAB_TYPESAFE_BY_RCU``. RCU operations may yield a | |
+-reference to an object from such a cache that has been concurrently | |
+-freed and the memory reallocated to a completely different object, | |
+-though of the same type. In this case RCU doesn't even protect the | |
+-identity of the object from changing, only its type. So the object | |
+-found may not be the one expected, but it will be one where it is safe | |
+-to take a reference or spinlock and then confirm that the identity | |
+-matches the expectations. | |
++reference to an object from such a cache that has been concurrently freed | |
++and the memory reallocated to a completely different object, though of | |
++the same type. In this case RCU doesn't even protect the identity of the | |
++object from changing, only its type. So the object found may not be the | |
++one expected, but it will be one where it is safe to take a reference | |
++(and then potentially acquiring a spinlock), allowing subsequent code | |
++to check whether the identity matches expectations. It is tempting | |
++to simply acquire the spinlock without first taking the reference, but | |
++unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be | |
++initialized after each and every call to kmem_cache_alloc(), which renders | |
++reference-free spinlock acquisition completely unsafe. Therefore, when | |
++using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter. | |
+ | |
+ With traditional reference counting -- such as that implemented by the | |
+ kref library in Linux -- there is typically code that runs when the last | |
+@@ -1057,14 +1064,20 @@ SRCU: Initialization/cleanup:: | |
+ init_srcu_struct | |
+ cleanup_srcu_struct | |
+ | |
+-All: lockdep-checked RCU-protected pointer access:: | |
++All: lockdep-checked RCU utility APIs:: | |
+ | |
+- rcu_access_pointer | |
+- rcu_dereference_raw | |
+ RCU_LOCKDEP_WARN | |
+ rcu_sleep_check | |
+ RCU_NONIDLE | |
+ | |
++All: Unchecked RCU-protected pointer access:: | |
++ | |
++ rcu_dereference_raw | |
++ | |
++All: Unchecked RCU-protected pointer access with dereferencing prohibited:: | |
++ | |
++ rcu_access_pointer | |
++ | |
+ See the comment headers in the source code (or the docbook generated | |
+ from them) for more information. | |
+ | |
+diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h | |
+index f527f27e6438..08605ce7379d 100644 | |
+--- a/include/linux/rcupdate.h | |
++++ b/include/linux/rcupdate.h | |
+@@ -42,7 +42,31 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); | |
+ void rcu_barrier_tasks(void); | |
+ void rcu_barrier_tasks_rude(void); | |
+ void synchronize_rcu(void); | |
++ | |
++struct rcu_gp_oldstate; | |
+ unsigned long get_completed_synchronize_rcu(void); | |
++void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); | |
++ | |
++// Maximum number of unsigned long values corresponding to | |
++// not-yet-completed RCU grace periods. | |
++#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2 | |
++ | |
++/** | |
++ * same_state_synchronize_rcu - Are two old-state values identical? | |
++ * @oldstate1: First old-state value. | |
++ * @oldstate2: Second old-state value. | |
++ * | |
++ * The two old-state values must have been obtained from either | |
++ * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or | |
++ * get_completed_synchronize_rcu(). Returns @true if the two values are | |
++ * identical and @false otherwise. This allows structures whose lifetimes | |
++ * are tracked by old-state values to push these values to a list header, | |
++ * allowing those structures to be slightly smaller. | |
++ */ | |
++static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2) | |
++{ | |
++ return oldstate1 == oldstate2; | |
++} | |
+ | |
+ #ifdef CONFIG_PREEMPT_RCU | |
+ | |
+@@ -496,13 +520,21 @@ do { \ | |
+ * against NULL. Although rcu_access_pointer() may also be used in cases | |
+ * where update-side locks prevent the value of the pointer from changing, | |
+ * you should instead use rcu_dereference_protected() for this use case. | |
++ * Within an RCU read-side critical section, there is little reason to | |
++ * use rcu_access_pointer(). | |
++ * | |
++ * It is usually best to test the rcu_access_pointer() return value | |
++ * directly in order to avoid accidental dereferences being introduced | |
++ * by later inattentive changes. In other words, assigning the | |
++ * rcu_access_pointer() return value to a local variable results in an | |
++ * accident waiting to happen. | |
+ * | |
+ * It is also permissible to use rcu_access_pointer() when read-side | |
+- * access to the pointer was removed at least one grace period ago, as | |
+- * is the case in the context of the RCU callback that is freeing up | |
+- * the data, or after a synchronize_rcu() returns. This can be useful | |
+- * when tearing down multi-linked structures after a grace period | |
+- * has elapsed. | |
++ * access to the pointer was removed at least one grace period ago, as is | |
++ * the case in the context of the RCU callback that is freeing up the data, | |
++ * or after a synchronize_rcu() returns. This can be useful when tearing | |
++ * down multi-linked structures after a grace period has elapsed. However, | |
++ * rcu_dereference_protected() is normally preferred for this use case. | |
+ */ | |
+ #define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu) | |
+ | |
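As a small illustration of how the cookie helpers documented in this hunk fit together, here is a hypothetical caller of the polled grace-period API; struct my_obj, retire_obj() and maybe_free_obj() are invented for the sketch, while the RCU calls themselves are the kernel's real API:

    struct my_obj {
            struct rcu_head rh;
            unsigned long gp_cookie;        /* from get_state_synchronize_rcu() */
            /* ... payload ... */
    };

    static void free_obj_cb(struct rcu_head *rhp)
    {
            kfree(container_of(rhp, struct my_obj, rh));
    }

    /* Called once the object is unreachable by new readers. */
    static void retire_obj(struct my_obj *p)
    {
            p->gp_cookie = get_state_synchronize_rcu();
    }

    /* Called later: free immediately if a grace period has already elapsed
     * since retire_obj(), otherwise defer the free via call_rcu().
     */
    static void maybe_free_obj(struct my_obj *p)
    {
            if (poll_state_synchronize_rcu(p->gp_cookie))
                    kfree(p);
            else
                    call_rcu(&p->rh, free_obj_cb);
    }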
+diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h | |
+index 62815c0a2dce..768196a5f39d 100644 | |
+--- a/include/linux/rcutiny.h | |
++++ b/include/linux/rcutiny.h | |
+@@ -14,25 +14,75 @@ | |
+ | |
+ #include <asm/param.h> /* for HZ */ | |
+ | |
++struct rcu_gp_oldstate { | |
++ unsigned long rgos_norm; | |
++}; | |
++ | |
++// Maximum number of rcu_gp_oldstate values corresponding to | |
++// not-yet-completed RCU grace periods. | |
++#define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 2 | |
++ | |
++/* | |
++ * Are the two oldstate values the same? See the Tree RCU version for | |
++ * docbook header. | |
++ */ | |
++static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1, | |
++ struct rcu_gp_oldstate *rgosp2) | |
++{ | |
++ return rgosp1->rgos_norm == rgosp2->rgos_norm; | |
++} | |
++ | |
+ unsigned long get_state_synchronize_rcu(void); | |
++ | |
++static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ rgosp->rgos_norm = get_state_synchronize_rcu(); | |
++} | |
++ | |
+ unsigned long start_poll_synchronize_rcu(void); | |
++ | |
++static inline void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ rgosp->rgos_norm = start_poll_synchronize_rcu(); | |
++} | |
++ | |
+ bool poll_state_synchronize_rcu(unsigned long oldstate); | |
+ | |
++static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ return poll_state_synchronize_rcu(rgosp->rgos_norm); | |
++} | |
++ | |
+ static inline void cond_synchronize_rcu(unsigned long oldstate) | |
+ { | |
+ might_sleep(); | |
+ } | |
+ | |
++static inline void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ cond_synchronize_rcu(rgosp->rgos_norm); | |
++} | |
++ | |
+ static inline unsigned long start_poll_synchronize_rcu_expedited(void) | |
+ { | |
+ return start_poll_synchronize_rcu(); | |
+ } | |
+ | |
++static inline void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ rgosp->rgos_norm = start_poll_synchronize_rcu_expedited(); | |
++} | |
++ | |
+ static inline void cond_synchronize_rcu_expedited(unsigned long oldstate) | |
+ { | |
+ cond_synchronize_rcu(oldstate); | |
+ } | |
+ | |
++static inline void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ cond_synchronize_rcu_expedited(rgosp->rgos_norm); | |
++} | |
++ | |
+ extern void rcu_barrier(void); | |
+ | |
+ static inline void synchronize_rcu_expedited(void) | |
+diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h | |
+index 47eaa4cb0df7..5efb51486e8a 100644 | |
+--- a/include/linux/rcutree.h | |
++++ b/include/linux/rcutree.h | |
+@@ -40,12 +40,52 @@ bool rcu_eqs_special_set(int cpu); | |
+ void rcu_momentary_dyntick_idle(void); | |
+ void kfree_rcu_scheduler_running(void); | |
+ bool rcu_gp_might_be_stalled(void); | |
++ | |
++struct rcu_gp_oldstate { | |
++ unsigned long rgos_norm; | |
++ unsigned long rgos_exp; | |
++}; | |
++ | |
++// Maximum number of rcu_gp_oldstate values corresponding to | |
++// not-yet-completed RCU grace periods. | |
++#define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 4 | |
++ | |
++/** | |
++ * same_state_synchronize_rcu_full - Are two old-state values identical? | |
++ * @rgosp1: First old-state value. | |
++ * @rgosp2: Second old-state value. | |
++ * | |
++ * The two old-state values must have been obtained from either | |
++ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), | |
++ * or get_completed_synchronize_rcu_full(). Returns @true if the two | |
++ * values are identical and @false otherwise. This allows structures | |
++ * whose lifetimes are tracked by old-state values to push these values | |
++ * to a list header, allowing those structures to be slightly smaller. | |
++ * | |
++ * Note that equality is judged on a bitwise basis, so that an | |
++ * @rcu_gp_oldstate structure with an already-completed state in one field | |
++ * will compare not-equal to a structure with an already-completed state | |
++ * in the other field. After all, the @rcu_gp_oldstate structure is opaque | |
++ * so how did such a situation come to pass in the first place? | |
++ */ | |
++static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1, | |
++ struct rcu_gp_oldstate *rgosp2) | |
++{ | |
++ return rgosp1->rgos_norm == rgosp2->rgos_norm && rgosp1->rgos_exp == rgosp2->rgos_exp; | |
++} | |
++ | |
+ unsigned long start_poll_synchronize_rcu_expedited(void); | |
++void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); | |
+ void cond_synchronize_rcu_expedited(unsigned long oldstate); | |
++void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); | |
+ unsigned long get_state_synchronize_rcu(void); | |
++void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); | |
+ unsigned long start_poll_synchronize_rcu(void); | |
++void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); | |
+ bool poll_state_synchronize_rcu(unsigned long oldstate); | |
++bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); | |
+ void cond_synchronize_rcu(unsigned long oldstate); | |
++void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); | |
+ | |
+ bool rcu_is_idle_cpu(int cpu); | |
+ | |
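A hedged usage sketch (not part of the patch) for the full-state polling API declared above: snapshot the grace-period state when an object is retired, then poll it later before reclaiming. struct cache_entry and the function names are hypothetical.

#include <linux/rcupdate.h>

struct cache_entry {
	struct rcu_gp_oldstate gp;	/* snapshot taken when the entry is retired */
	void *payload;
};

static void cache_entry_retire(struct cache_entry *e)
{
	/* Record normal and expedited grace-period state, starting a GP if needed. */
	start_poll_synchronize_rcu_full(&e->gp);
}

static bool cache_entry_reclaimable(struct cache_entry *e)
{
	/* True once a full grace period has elapsed since retirement. */
	return poll_state_synchronize_rcu_full(&e->gp);
}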
+diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h | |
+index 6cfaa0a9a9b9..5aa5e0faf6a1 100644 | |
+--- a/include/linux/srcutiny.h | |
++++ b/include/linux/srcutiny.h | |
+@@ -15,10 +15,10 @@ | |
+ | |
+ struct srcu_struct { | |
+ short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ | |
+- unsigned short srcu_idx; /* Current reader array element in bit 0x2. */ | |
+- unsigned short srcu_idx_max; /* Furthest future srcu_idx request. */ | |
+ u8 srcu_gp_running; /* GP workqueue running? */ | |
+ u8 srcu_gp_waiting; /* GP waiting for readers? */ | |
++ unsigned long srcu_idx; /* Current reader array element in bit 0x2. */ | |
++ unsigned long srcu_idx_max; /* Furthest future srcu_idx request. */ | |
+ struct swait_queue_head srcu_wq; | |
+ /* Last srcu_read_unlock() wakes GP. */ | |
+ struct rcu_head *srcu_cb_head; /* Pending callbacks: Head. */ | |
+@@ -82,10 +82,12 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, | |
+ int idx; | |
+ | |
+ idx = ((data_race(READ_ONCE(ssp->srcu_idx)) + 1) & 0x2) >> 1; | |
+- pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", | |
++ pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd) gp: %lu->%lu\n", | |
+ tt, tf, idx, | |
+ data_race(READ_ONCE(ssp->srcu_lock_nesting[!idx])), | |
+- data_race(READ_ONCE(ssp->srcu_lock_nesting[idx]))); | |
++ data_race(READ_ONCE(ssp->srcu_lock_nesting[idx])), | |
++ data_race(READ_ONCE(ssp->srcu_idx)), | |
++ data_race(READ_ONCE(ssp->srcu_idx_max))); | |
+ } | |
+ | |
+ #endif | |
+diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c | |
+index d8e1b270a065..503c2aa845a4 100644 | |
+--- a/kernel/rcu/rcutorture.c | |
++++ b/kernel/rcu/rcutorture.c | |
+@@ -84,10 +84,15 @@ torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress test | |
+ torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()"); | |
+ torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); | |
+ torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives"); | |
++torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives"); | |
++torture_param(bool, gp_cond_exp_full, false, | |
++	      "Use conditional/async full-state expedited GP wait primitives"); | |
+ torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); | |
+ torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); | |
+ torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); | |
+ torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); | |
++torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives"); | |
++torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives"); | |
+ torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); | |
+ torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); | |
+ torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); | |
+@@ -194,16 +199,24 @@ static int rcu_torture_writer_state; | |
+ #define RTWS_DEF_FREE 3 | |
+ #define RTWS_EXP_SYNC 4 | |
+ #define RTWS_COND_GET 5 | |
+-#define RTWS_COND_GET_EXP 6 | |
+-#define RTWS_COND_SYNC 7 | |
+-#define RTWS_COND_SYNC_EXP 8 | |
+-#define RTWS_POLL_GET 9 | |
+-#define RTWS_POLL_GET_EXP 10 | |
+-#define RTWS_POLL_WAIT 11 | |
+-#define RTWS_POLL_WAIT_EXP 12 | |
+-#define RTWS_SYNC 13 | |
+-#define RTWS_STUTTER 14 | |
+-#define RTWS_STOPPING 15 | |
++#define RTWS_COND_GET_FULL 6 | |
++#define RTWS_COND_GET_EXP 7 | |
++#define RTWS_COND_GET_EXP_FULL 8 | |
++#define RTWS_COND_SYNC 9 | |
++#define RTWS_COND_SYNC_FULL 10 | |
++#define RTWS_COND_SYNC_EXP 11 | |
++#define RTWS_COND_SYNC_EXP_FULL 12 | |
++#define RTWS_POLL_GET 13 | |
++#define RTWS_POLL_GET_FULL 14 | |
++#define RTWS_POLL_GET_EXP 15 | |
++#define RTWS_POLL_GET_EXP_FULL 16 | |
++#define RTWS_POLL_WAIT 17 | |
++#define RTWS_POLL_WAIT_FULL 18 | |
++#define RTWS_POLL_WAIT_EXP 19 | |
++#define RTWS_POLL_WAIT_EXP_FULL 20 | |
++#define RTWS_SYNC 21 | |
++#define RTWS_STUTTER 22 | |
++#define RTWS_STOPPING 23 | |
+ static const char * const rcu_torture_writer_state_names[] = { | |
+ "RTWS_FIXED_DELAY", | |
+ "RTWS_DELAY", | |
+@@ -211,13 +224,21 @@ static const char * const rcu_torture_writer_state_names[] = { | |
+ "RTWS_DEF_FREE", | |
+ "RTWS_EXP_SYNC", | |
+ "RTWS_COND_GET", | |
++ "RTWS_COND_GET_FULL", | |
+ "RTWS_COND_GET_EXP", | |
++ "RTWS_COND_GET_EXP_FULL", | |
+ "RTWS_COND_SYNC", | |
++ "RTWS_COND_SYNC_FULL", | |
+ "RTWS_COND_SYNC_EXP", | |
++ "RTWS_COND_SYNC_EXP_FULL", | |
+ "RTWS_POLL_GET", | |
++ "RTWS_POLL_GET_FULL", | |
+ "RTWS_POLL_GET_EXP", | |
++ "RTWS_POLL_GET_EXP_FULL", | |
+ "RTWS_POLL_WAIT", | |
++ "RTWS_POLL_WAIT_FULL", | |
+ "RTWS_POLL_WAIT_EXP", | |
++ "RTWS_POLL_WAIT_EXP_FULL", | |
+ "RTWS_SYNC", | |
+ "RTWS_STUTTER", | |
+ "RTWS_STOPPING", | |
+@@ -332,13 +353,21 @@ struct rcu_torture_ops { | |
+ void (*exp_sync)(void); | |
+ unsigned long (*get_gp_state_exp)(void); | |
+ unsigned long (*start_gp_poll_exp)(void); | |
++ void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp); | |
+ bool (*poll_gp_state_exp)(unsigned long oldstate); | |
+ void (*cond_sync_exp)(unsigned long oldstate); | |
++ void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp); | |
+ unsigned long (*get_gp_state)(void); | |
++ void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp); | |
+ unsigned long (*get_gp_completed)(void); | |
++ void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp); | |
+ unsigned long (*start_gp_poll)(void); | |
++ void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp); | |
+ bool (*poll_gp_state)(unsigned long oldstate); | |
++ bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp); | |
++ bool (*poll_need_2gp)(bool poll, bool poll_full); | |
+ void (*cond_sync)(unsigned long oldstate); | |
++ void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp); | |
+ call_rcu_func_t call; | |
+ void (*cb_barrier)(void); | |
+ void (*fqs)(void); | |
+@@ -489,6 +518,11 @@ static void rcu_sync_torture_init(void) | |
+ INIT_LIST_HEAD(&rcu_torture_removed); | |
+ } | |
+ | |
++static bool rcu_poll_need_2gp(bool poll, bool poll_full) | |
++{ | |
++ return poll; | |
++} | |
++ | |
+ static struct rcu_torture_ops rcu_ops = { | |
+ .ttype = RCU_FLAVOR, | |
+ .init = rcu_sync_torture_init, | |
+@@ -502,12 +536,19 @@ static struct rcu_torture_ops rcu_ops = { | |
+ .sync = synchronize_rcu, | |
+ .exp_sync = synchronize_rcu_expedited, | |
+ .get_gp_state = get_state_synchronize_rcu, | |
++ .get_gp_state_full = get_state_synchronize_rcu_full, | |
+ .get_gp_completed = get_completed_synchronize_rcu, | |
++ .get_gp_completed_full = get_completed_synchronize_rcu_full, | |
+ .start_gp_poll = start_poll_synchronize_rcu, | |
++ .start_gp_poll_full = start_poll_synchronize_rcu_full, | |
+ .poll_gp_state = poll_state_synchronize_rcu, | |
++ .poll_gp_state_full = poll_state_synchronize_rcu_full, | |
++ .poll_need_2gp = rcu_poll_need_2gp, | |
+ .cond_sync = cond_synchronize_rcu, | |
++ .cond_sync_full = cond_synchronize_rcu_full, | |
+ .get_gp_state_exp = get_state_synchronize_rcu, | |
+ .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, | |
++ .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, | |
+ .poll_gp_state_exp = poll_state_synchronize_rcu, | |
+ .cond_sync_exp = cond_synchronize_rcu_expedited, | |
+ .call = call_rcu, | |
+@@ -709,6 +750,9 @@ static struct rcu_torture_ops srcud_ops = { | |
+ .deferred_free = srcu_torture_deferred_free, | |
+ .sync = srcu_torture_synchronize, | |
+ .exp_sync = srcu_torture_synchronize_expedited, | |
++ .get_gp_state = srcu_torture_get_gp_state, | |
++ .start_gp_poll = srcu_torture_start_gp_poll, | |
++ .poll_gp_state = srcu_torture_poll_gp_state, | |
+ .call = srcu_torture_call, | |
+ .cb_barrier = srcu_torture_barrier, | |
+ .stats = srcu_torture_stats, | |
+@@ -1148,15 +1192,35 @@ static int nsynctypes; | |
+ */ | |
+ static void rcu_torture_write_types(void) | |
+ { | |
+- bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp; | |
+- bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll; | |
+- bool gp_sync1 = gp_sync; | |
++ bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full; | |
++ bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp; | |
++ bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll; | |
++ bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync; | |
+ | |
+ /* Initialize synctype[] array. If none set, take default. */ | |
+- if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp && | |
+- !gp_normal1 && !gp_poll1 && !gp_sync1) | |
+- gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 = | |
+- gp_normal1 = gp_poll1 = gp_sync1 = true; | |
++ if (!gp_cond1 && | |
++ !gp_cond_exp1 && | |
++ !gp_cond_full1 && | |
++ !gp_cond_exp_full1 && | |
++ !gp_exp1 && | |
++ !gp_poll_exp1 && | |
++ !gp_poll_exp_full1 && | |
++ !gp_normal1 && | |
++ !gp_poll1 && | |
++ !gp_poll_full1 && | |
++ !gp_sync1) { | |
++ gp_cond1 = true; | |
++ gp_cond_exp1 = true; | |
++ gp_cond_full1 = true; | |
++ gp_cond_exp_full1 = true; | |
++ gp_exp1 = true; | |
++ gp_poll_exp1 = true; | |
++ gp_poll_exp_full1 = true; | |
++ gp_normal1 = true; | |
++ gp_poll1 = true; | |
++ gp_poll_full1 = true; | |
++ gp_sync1 = true; | |
++ } | |
+ if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { | |
+ synctype[nsynctypes++] = RTWS_COND_GET; | |
+ pr_info("%s: Testing conditional GPs.\n", __func__); | |
+@@ -1169,6 +1233,19 @@ static void rcu_torture_write_types(void) | |
+ } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) { | |
+ pr_alert("%s: gp_cond_exp without primitives.\n", __func__); | |
+ } | |
++ if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) { | |
++ synctype[nsynctypes++] = RTWS_COND_GET_FULL; | |
++ pr_info("%s: Testing conditional full-state GPs.\n", __func__); | |
++ } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) { | |
++ pr_alert("%s: gp_cond_full without primitives.\n", __func__); | |
++ } | |
++ if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) { | |
++ synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL; | |
++ pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__); | |
++ } else if (gp_cond_exp_full && | |
++ (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) { | |
++ pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__); | |
++ } | |
+ if (gp_exp1 && cur_ops->exp_sync) { | |
+ synctype[nsynctypes++] = RTWS_EXP_SYNC; | |
+ pr_info("%s: Testing expedited GPs.\n", __func__); | |
+@@ -1187,12 +1264,25 @@ static void rcu_torture_write_types(void) | |
+ } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { | |
+ pr_alert("%s: gp_poll without primitives.\n", __func__); | |
+ } | |
++ if (gp_poll_full1 && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) { | |
++ synctype[nsynctypes++] = RTWS_POLL_GET_FULL; | |
++ pr_info("%s: Testing polling full-state GPs.\n", __func__); | |
++ } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) { | |
++ pr_alert("%s: gp_poll_full without primitives.\n", __func__); | |
++ } | |
+ if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) { | |
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP; | |
+ pr_info("%s: Testing polling expedited GPs.\n", __func__); | |
+ } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) { | |
+ pr_alert("%s: gp_poll_exp without primitives.\n", __func__); | |
+ } | |
++ if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) { | |
++ synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL; | |
++ pr_info("%s: Testing polling full-state expedited GPs.\n", __func__); | |
++ } else if (gp_poll_exp_full && | |
++ (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) { | |
++ pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__); | |
++ } | |
+ if (gp_sync1 && cur_ops->sync) { | |
+ synctype[nsynctypes++] = RTWS_SYNC; | |
+ pr_info("%s: Testing normal GPs.\n", __func__); | |
+@@ -1201,6 +1291,40 @@ static void rcu_torture_write_types(void) | |
+ } | |
+ } | |
+ | |
++/* | |
++ * Do the specified rcu_torture_writer() synchronous grace period, | |
++ * while also testing out the polled APIs. Note well that the single-CPU | |
++ * grace-period optimizations must be accounted for. | |
++ */ | |
++static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void)) | |
++{ | |
++ unsigned long cookie; | |
++ struct rcu_gp_oldstate cookie_full; | |
++ bool dopoll; | |
++ bool dopoll_full; | |
++ unsigned long r = torture_random(trsp); | |
++ | |
++ dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300); | |
++ dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00); | |
++ if (dopoll || dopoll_full) | |
++ cpus_read_lock(); | |
++ if (dopoll) | |
++ cookie = cur_ops->get_gp_state(); | |
++ if (dopoll_full) | |
++ cur_ops->get_gp_state_full(&cookie_full); | |
++ if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full)) | |
++ sync(); | |
++ sync(); | |
++ WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie), | |
++ "%s: Cookie check 3 failed %pS() online %*pbl.", | |
++ __func__, sync, cpumask_pr_args(cpu_online_mask)); | |
++ WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full), | |
++ "%s: Cookie check 4 failed %pS() online %*pbl", | |
++ __func__, sync, cpumask_pr_args(cpu_online_mask)); | |
++ if (dopoll || dopoll_full) | |
++ cpus_read_unlock(); | |
++} | |
++ | |
+ /* | |
+ * RCU torture writer kthread. Repeatedly substitutes a new structure | |
+ * for that pointed to by rcu_torture_current, freeing the old structure | |
+@@ -1212,8 +1336,10 @@ rcu_torture_writer(void *arg) | |
+ bool boot_ended; | |
+ bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); | |
+ unsigned long cookie; | |
++ struct rcu_gp_oldstate cookie_full; | |
+ int expediting = 0; | |
+ unsigned long gp_snap; | |
++ struct rcu_gp_oldstate gp_snap_full; | |
+ int i; | |
+ int idx; | |
+ int oldnice = task_nice(current); | |
+@@ -1261,11 +1387,12 @@ rcu_torture_writer(void *arg) | |
+ atomic_inc(&rcu_torture_wcount[i]); | |
+ WRITE_ONCE(old_rp->rtort_pipe_count, | |
+ old_rp->rtort_pipe_count + 1); | |
++ | |
++ // Make sure readers block polled grace periods. | |
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { | |
+ idx = cur_ops->readlock(); | |
+ cookie = cur_ops->get_gp_state(); | |
+- WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && | |
+- cur_ops->poll_gp_state(cookie), | |
++ WARN_ONCE(cur_ops->poll_gp_state(cookie), | |
+ "%s: Cookie check 1 failed %s(%d) %lu->%lu\n", | |
+ __func__, | |
+ rcu_torture_writer_state_getname(), | |
+@@ -1277,6 +1404,21 @@ rcu_torture_writer(void *arg) | |
+ } | |
+ cur_ops->readunlock(idx); | |
+ } | |
++ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) { | |
++ idx = cur_ops->readlock(); | |
++ cur_ops->get_gp_state_full(&cookie_full); | |
++ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), | |
++ "%s: Cookie check 5 failed %s(%d) online %*pbl\n", | |
++ __func__, | |
++ rcu_torture_writer_state_getname(), | |
++ rcu_torture_writer_state, | |
++ cpumask_pr_args(cpu_online_mask)); | |
++ if (cur_ops->get_gp_completed_full) { | |
++ cur_ops->get_gp_completed_full(&cookie_full); | |
++ WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full)); | |
++ } | |
++ cur_ops->readunlock(idx); | |
++ } | |
+ switch (synctype[torture_random(&rand) % nsynctypes]) { | |
+ case RTWS_DEF_FREE: | |
+ rcu_torture_writer_state = RTWS_DEF_FREE; | |
+@@ -1284,12 +1426,7 @@ rcu_torture_writer(void *arg) | |
+ break; | |
+ case RTWS_EXP_SYNC: | |
+ rcu_torture_writer_state = RTWS_EXP_SYNC; | |
+- if (cur_ops->get_gp_state && cur_ops->poll_gp_state) | |
+- cookie = cur_ops->get_gp_state(); | |
+- cur_ops->exp_sync(); | |
+- cur_ops->exp_sync(); | |
+- if (cur_ops->get_gp_state && cur_ops->poll_gp_state) | |
+- WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); | |
++ do_rtws_sync(&rand, cur_ops->exp_sync); | |
+ rcu_torture_pipe_update(old_rp); | |
+ break; | |
+ case RTWS_COND_GET: | |
+@@ -1308,6 +1445,22 @@ rcu_torture_writer(void *arg) | |
+ cur_ops->cond_sync_exp(gp_snap); | |
+ rcu_torture_pipe_update(old_rp); | |
+ break; | |
++ case RTWS_COND_GET_FULL: | |
++ rcu_torture_writer_state = RTWS_COND_GET_FULL; | |
++ cur_ops->get_gp_state_full(&gp_snap_full); | |
++ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); | |
++ rcu_torture_writer_state = RTWS_COND_SYNC_FULL; | |
++ cur_ops->cond_sync_full(&gp_snap_full); | |
++ rcu_torture_pipe_update(old_rp); | |
++ break; | |
++ case RTWS_COND_GET_EXP_FULL: | |
++ rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL; | |
++ cur_ops->get_gp_state_full(&gp_snap_full); | |
++ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); | |
++ rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL; | |
++ cur_ops->cond_sync_exp_full(&gp_snap_full); | |
++ rcu_torture_pipe_update(old_rp); | |
++ break; | |
+ case RTWS_POLL_GET: | |
+ rcu_torture_writer_state = RTWS_POLL_GET; | |
+ gp_snap = cur_ops->start_gp_poll(); | |
+@@ -1317,6 +1470,15 @@ rcu_torture_writer(void *arg) | |
+ &rand); | |
+ rcu_torture_pipe_update(old_rp); | |
+ break; | |
++ case RTWS_POLL_GET_FULL: | |
++ rcu_torture_writer_state = RTWS_POLL_GET_FULL; | |
++ cur_ops->start_gp_poll_full(&gp_snap_full); | |
++ rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; | |
++ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) | |
++ torture_hrtimeout_jiffies(torture_random(&rand) % 16, | |
++ &rand); | |
++ rcu_torture_pipe_update(old_rp); | |
++ break; | |
+ case RTWS_POLL_GET_EXP: | |
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP; | |
+ gp_snap = cur_ops->start_gp_poll_exp(); | |
+@@ -1326,14 +1488,18 @@ rcu_torture_writer(void *arg) | |
+ &rand); | |
+ rcu_torture_pipe_update(old_rp); | |
+ break; | |
++ case RTWS_POLL_GET_EXP_FULL: | |
++ rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL; | |
++ cur_ops->start_gp_poll_exp_full(&gp_snap_full); | |
++ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL; | |
++ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) | |
++ torture_hrtimeout_jiffies(torture_random(&rand) % 16, | |
++ &rand); | |
++ rcu_torture_pipe_update(old_rp); | |
++ break; | |
+ case RTWS_SYNC: | |
+ rcu_torture_writer_state = RTWS_SYNC; | |
+- if (cur_ops->get_gp_state && cur_ops->poll_gp_state) | |
+- cookie = cur_ops->get_gp_state(); | |
+- cur_ops->sync(); | |
+- cur_ops->sync(); | |
+- if (cur_ops->get_gp_state && cur_ops->poll_gp_state) | |
+- WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); | |
++ do_rtws_sync(&rand, cur_ops->sync); | |
+ rcu_torture_pipe_update(old_rp); | |
+ break; | |
+ default: | |
+@@ -1400,6 +1566,7 @@ static int | |
+ rcu_torture_fakewriter(void *arg) | |
+ { | |
+ unsigned long gp_snap; | |
++ struct rcu_gp_oldstate gp_snap_full; | |
+ DEFINE_TORTURE_RANDOM(rand); | |
+ | |
+ VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); | |
+@@ -1438,6 +1605,16 @@ rcu_torture_fakewriter(void *arg) | |
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); | |
+ cur_ops->cond_sync_exp(gp_snap); | |
+ break; | |
++ case RTWS_COND_GET_FULL: | |
++ cur_ops->get_gp_state_full(&gp_snap_full); | |
++ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); | |
++ cur_ops->cond_sync_full(&gp_snap_full); | |
++ break; | |
++ case RTWS_COND_GET_EXP_FULL: | |
++ cur_ops->get_gp_state_full(&gp_snap_full); | |
++ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); | |
++ cur_ops->cond_sync_exp_full(&gp_snap_full); | |
++ break; | |
+ case RTWS_POLL_GET: | |
+ gp_snap = cur_ops->start_gp_poll(); | |
+ while (!cur_ops->poll_gp_state(gp_snap)) { | |
+@@ -1445,6 +1622,13 @@ rcu_torture_fakewriter(void *arg) | |
+ &rand); | |
+ } | |
+ break; | |
++ case RTWS_POLL_GET_FULL: | |
++ cur_ops->start_gp_poll_full(&gp_snap_full); | |
++ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { | |
++ torture_hrtimeout_jiffies(torture_random(&rand) % 16, | |
++ &rand); | |
++ } | |
++ break; | |
+ case RTWS_POLL_GET_EXP: | |
+ gp_snap = cur_ops->start_gp_poll_exp(); | |
+ while (!cur_ops->poll_gp_state_exp(gp_snap)) { | |
+@@ -1452,6 +1636,13 @@ rcu_torture_fakewriter(void *arg) | |
+ &rand); | |
+ } | |
+ break; | |
++ case RTWS_POLL_GET_EXP_FULL: | |
++ cur_ops->start_gp_poll_exp_full(&gp_snap_full); | |
++ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { | |
++ torture_hrtimeout_jiffies(torture_random(&rand) % 16, | |
++ &rand); | |
++ } | |
++ break; | |
+ case RTWS_SYNC: | |
+ cur_ops->sync(); | |
+ break; | |
+@@ -1715,7 +1906,9 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, | |
+ */ | |
+ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) | |
+ { | |
++ bool checkpolling = !(torture_random(trsp) & 0xfff); | |
+ unsigned long cookie; | |
++ struct rcu_gp_oldstate cookie_full; | |
+ int i; | |
+ unsigned long started; | |
+ unsigned long completed; | |
+@@ -1731,8 +1924,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) | |
+ WARN_ON_ONCE(!rcu_is_watching()); | |
+ newstate = rcutorture_extend_mask(readstate, trsp); | |
+ rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); | |
+- if (cur_ops->get_gp_state && cur_ops->poll_gp_state) | |
+- cookie = cur_ops->get_gp_state(); | |
++ if (checkpolling) { | |
++ if (cur_ops->get_gp_state && cur_ops->poll_gp_state) | |
++ cookie = cur_ops->get_gp_state(); | |
++ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) | |
++ cur_ops->get_gp_state_full(&cookie_full); | |
++ } | |
+ started = cur_ops->get_gp_seq(); | |
+ ts = rcu_trace_clock_local(); | |
+ p = rcu_dereference_check(rcu_torture_current, | |
+@@ -1766,13 +1963,22 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) | |
+ } | |
+ __this_cpu_inc(rcu_torture_batch[completed]); | |
+ preempt_enable(); | |
+- if (cur_ops->get_gp_state && cur_ops->poll_gp_state) | |
+- WARN_ONCE(cur_ops->poll_gp_state(cookie), | |
+- "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", | |
+- __func__, | |
+- rcu_torture_writer_state_getname(), | |
+- rcu_torture_writer_state, | |
+- cookie, cur_ops->get_gp_state()); | |
++ if (checkpolling) { | |
++ if (cur_ops->get_gp_state && cur_ops->poll_gp_state) | |
++ WARN_ONCE(cur_ops->poll_gp_state(cookie), | |
++ "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", | |
++ __func__, | |
++ rcu_torture_writer_state_getname(), | |
++ rcu_torture_writer_state, | |
++ cookie, cur_ops->get_gp_state()); | |
++ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) | |
++ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), | |
++ "%s: Cookie check 6 failed %s(%d) online %*pbl\n", | |
++ __func__, | |
++ rcu_torture_writer_state_getname(), | |
++ rcu_torture_writer_state, | |
++ cpumask_pr_args(cpu_online_mask)); | |
++ } | |
+ rcutorture_one_extend(&readstate, 0, trsp, rtrsp); | |
+ WARN_ON_ONCE(readstate); | |
+ // This next splat is expected behavior if leakpointer, especially | |
+@@ -2600,12 +2806,12 @@ static int rcutorture_oom_notify(struct notifier_block *self, | |
+ for (i = 0; i < fwd_progress; i++) | |
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); | |
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); | |
+- rcu_barrier(); | |
++ cur_ops->cb_barrier(); | |
+ ncbs = 0; | |
+ for (i = 0; i < fwd_progress; i++) | |
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); | |
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); | |
+- rcu_barrier(); | |
++ cur_ops->cb_barrier(); | |
+ ncbs = 0; | |
+ for (i = 0; i < fwd_progress; i++) | |
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); | |
+diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c | |
+index 92c002d65482..33adafdad261 100644 | |
+--- a/kernel/rcu/srcutiny.c | |
++++ b/kernel/rcu/srcutiny.c | |
+@@ -117,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp) | |
+ struct srcu_struct *ssp; | |
+ | |
+ ssp = container_of(wp, struct srcu_struct, srcu_work); | |
+- if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) | |
++ if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) | |
+ return; /* Already running or nothing to do. */ | |
+ | |
+ /* Remove recently arrived callbacks and wait for readers. */ | |
+@@ -150,17 +150,17 @@ void srcu_drive_gp(struct work_struct *wp) | |
+ * straighten that out. | |
+ */ | |
+ WRITE_ONCE(ssp->srcu_gp_running, false); | |
+- if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) | |
++ if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) | |
+ schedule_work(&ssp->srcu_work); | |
+ } | |
+ EXPORT_SYMBOL_GPL(srcu_drive_gp); | |
+ | |
+ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) | |
+ { | |
+- unsigned short cookie; | |
++ unsigned long cookie; | |
+ | |
+ cookie = get_state_synchronize_srcu(ssp); | |
+- if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) | |
++ if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) | |
+ return; | |
+ WRITE_ONCE(ssp->srcu_idx_max, cookie); | |
+ if (!READ_ONCE(ssp->srcu_gp_running)) { | |
+@@ -215,7 +215,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) | |
+ barrier(); | |
+ ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1; | |
+ barrier(); | |
+- return ret & USHRT_MAX; | |
++ return ret; | |
+ } | |
+ EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); | |
+ | |
+@@ -240,10 +240,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); | |
+ */ | |
+ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) | |
+ { | |
+- bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie); | |
++ unsigned long cur_s = READ_ONCE(ssp->srcu_idx); | |
+ | |
+ barrier(); | |
+- return ret; | |
++ return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); | |
+ } | |
+ EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); | |
+ | |
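The rewritten poll_state_synchronize_srcu() above depends on wrap-safe unsigned long comparisons, which is why srcu_idx and the cookies are widened from unsigned short. The standalone sketch below (not part of the patch) models those comparisons in the style of the kernel's ULONG_CMP_GE()/ULONG_CMP_LT() helpers and compiles as ordinary userspace C:

#include <limits.h>
#include <stdio.h>

/* Modeled after the helpers in kernel/rcu/rcu.h. */
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long cookie = ULONG_MAX - 1;	/* handed out just before wrap */
	unsigned long cur_s = 2;		/* counter has since wrapped past it */

	/* Treated as "cur_s has reached cookie" despite the wrap. */
	printf("%d\n", ULONG_CMP_GE(cur_s, cookie));	/* prints 1 */
	printf("%d\n", ULONG_CMP_LT(cur_s, cookie));	/* prints 0 */
	return 0;
}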
+diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h | |
+index 83c7e6620d40..f5bf6fb430da 100644 | |
+--- a/kernel/rcu/tasks.h | |
++++ b/kernel/rcu/tasks.h | |
+@@ -560,7 +560,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) | |
+ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) | |
+ { | |
+ /* Complain if the scheduler has not started. */ | |
+- RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, | |
++ WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, | |
+ "synchronize_rcu_tasks called too soon"); | |
+ | |
+ // If the grace-period kthread is running, use it. | |
+@@ -1500,6 +1500,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) | |
+ if (rcu_tasks_trace_pertask_prep(t, true)) | |
+ trc_add_holdout(t, hop); | |
+ rcu_read_unlock(); | |
++ cond_resched_tasks_rcu_qs(); | |
+ } | |
+ | |
+ // Only after all running tasks have been accounted for is it | |
+@@ -1520,6 +1521,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) | |
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags); | |
+ } | |
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); | |
++ cond_resched_tasks_rcu_qs(); | |
+ } | |
+ | |
+ // Re-enable CPU hotplug now that the holdout list is populated. | |
+@@ -1619,6 +1621,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, | |
+ trc_del_holdout(t); | |
+ else if (needreport) | |
+ show_stalled_task_trace(t, firstreport); | |
++ cond_resched_tasks_rcu_qs(); | |
+ } | |
+ | |
+ // Re-enable CPU hotplug now that the holdout list scan has completed. | |
+diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c | |
+index f0561ee16b9c..a33a8d4942c3 100644 | |
+--- a/kernel/rcu/tiny.c | |
++++ b/kernel/rcu/tiny.c | |
+@@ -158,6 +158,10 @@ void synchronize_rcu(void) | |
+ } | |
+ EXPORT_SYMBOL_GPL(synchronize_rcu); | |
+ | |
++static void tiny_rcu_leak_callback(struct rcu_head *rhp) | |
++{ | |
++} | |
++ | |
+ /* | |
+ * Post an RCU callback to be invoked after the end of an RCU grace | |
+ * period. But since we have but one CPU, that would be after any | |
+@@ -165,9 +169,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu); | |
+ */ | |
+ void call_rcu(struct rcu_head *head, rcu_callback_t func) | |
+ { | |
++ static atomic_t doublefrees; | |
+ unsigned long flags; | |
+ | |
+- debug_rcu_head_queue(head); | |
++ if (debug_rcu_head_queue(head)) { | |
++ if (atomic_inc_return(&doublefrees) < 4) { | |
++ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); | |
++ mem_dump_obj(head); | |
++ } | |
++ | |
++ if (!__is_kvfree_rcu_offset((unsigned long)head->func)) | |
++ WRITE_ONCE(head->func, tiny_rcu_leak_callback); | |
++ return; | |
++ } | |
++ | |
+ head->func = func; | |
+ head->next = NULL; | |
+ | |
+@@ -183,6 +198,16 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) | |
+ } | |
+ EXPORT_SYMBOL_GPL(call_rcu); | |
+ | |
++/* | |
++ * Store a grace-period-counter "cookie". For more information, | |
++ * see the Tree RCU header comment. | |
++ */ | |
++void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED; | |
++} | |
++EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full); | |
++ | |
+ /* | |
+ * Return a grace-period-counter "cookie". For more information, | |
+ * see the Tree RCU header comment. | |
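A hedged sketch (not part of the patch) of the usage error that the new debug_rcu_head_queue() handling in Tiny RCU's call_rcu() above is meant to report: queuing the same rcu_head twice before its callback has run. struct foo, foo_release(), and foo_buggy_teardown() are hypothetical names.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	struct rcu_head rh;
};

static void foo_release(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct foo, rh));
}

static void foo_buggy_teardown(struct foo *p)
{
	call_rcu(&p->rh, foo_release);
	/* BUG: same rcu_head queued again before foo_release() has run. */
	call_rcu(&p->rh, foo_release);
}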
+diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c | |
+index 79aea7df4345..6bb8e72bc815 100644 | |
+--- a/kernel/rcu/tree.c | |
++++ b/kernel/rcu/tree.c | |
+@@ -76,6 +76,7 @@ | |
+ /* Data structures. */ | |
+ | |
+ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { | |
++ .gpwrap = true, | |
+ #ifdef CONFIG_RCU_NOCB_CPU | |
+ .cblist.flags = SEGCBLIST_RCU_CORE, | |
+ #endif | |
+@@ -1755,6 +1756,8 @@ static noinline void rcu_gp_cleanup(void) | |
+ dump_blkd_tasks(rnp, 10); | |
+ WARN_ON_ONCE(rnp->qsmask); | |
+ WRITE_ONCE(rnp->gp_seq, new_gp_seq); | |
++ if (!rnp->parent) | |
++ smp_mb(); // Order against failing poll_state_synchronize_rcu_full(). | |
+ rdp = this_cpu_ptr(&rcu_data); | |
+ if (rnp == rdp->mynode) | |
+ needgp = __note_gp_changes(rnp, rdp) || needgp; | |
+@@ -2341,8 +2344,8 @@ void rcu_sched_clock_irq(int user) | |
+ rcu_flavor_sched_clock_irq(user); | |
+ if (rcu_pending(user)) | |
+ invoke_rcu_core(); | |
+- if (user) | |
+- rcu_tasks_classic_qs(current, false); | |
++ if (user || rcu_is_cpu_rrupt_from_idle()) | |
++ rcu_note_voluntary_context_switch(current); | |
+ lockdep_assert_irqs_disabled(); | |
+ | |
+ trace_rcu_utilization(TPS("End scheduler-tick")); | |
+@@ -2832,7 +2835,7 @@ EXPORT_SYMBOL_GPL(call_rcu); | |
+ | |
+ | |
+ /* Maximum number of jiffies to wait before draining a batch. */ | |
+-#define KFREE_DRAIN_JIFFIES (HZ / 50) | |
++#define KFREE_DRAIN_JIFFIES (5 * HZ) | |
+ #define KFREE_N_BATCHES 2 | |
+ #define FREE_N_CHANNELS 2 | |
+ | |
+@@ -3093,6 +3096,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp) | |
+ return !!krcp->head; | |
+ } | |
+ | |
++static void | |
++schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) | |
++{ | |
++ long delay, delay_left; | |
++ | |
++ delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; | |
++ if (delayed_work_pending(&krcp->monitor_work)) { | |
++ delay_left = krcp->monitor_work.timer.expires - jiffies; | |
++ if (delay < delay_left) | |
++ mod_delayed_work(system_wq, &krcp->monitor_work, delay); | |
++ return; | |
++ } | |
++ queue_delayed_work(system_wq, &krcp->monitor_work, delay); | |
++} | |
++ | |
+ /* | |
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. | |
+ */ | |
+@@ -3150,7 +3168,7 @@ static void kfree_rcu_monitor(struct work_struct *work) | |
+ // work to repeat an attempt. Because previous batches are | |
+ // still in progress. | |
+ if (need_offload_krc(krcp)) | |
+- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); | |
++ schedule_delayed_monitor_work(krcp); | |
+ | |
+ raw_spin_unlock_irqrestore(&krcp->lock, flags); | |
+ } | |
+@@ -3183,15 +3201,16 @@ static void fill_page_cache_func(struct work_struct *work) | |
+ bnode = (struct kvfree_rcu_bulk_data *) | |
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); | |
+ | |
+- if (bnode) { | |
+- raw_spin_lock_irqsave(&krcp->lock, flags); | |
+- pushed = put_cached_bnode(krcp, bnode); | |
+- raw_spin_unlock_irqrestore(&krcp->lock, flags); | |
++ if (!bnode) | |
++ break; | |
+ | |
+- if (!pushed) { | |
+- free_page((unsigned long) bnode); | |
+- break; | |
+- } | |
++ raw_spin_lock_irqsave(&krcp->lock, flags); | |
++ pushed = put_cached_bnode(krcp, bnode); | |
++ raw_spin_unlock_irqrestore(&krcp->lock, flags); | |
++ | |
++ if (!pushed) { | |
++ free_page((unsigned long) bnode); | |
++ break; | |
+ } | |
+ } | |
+ | |
+@@ -3338,7 +3357,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) | |
+ | |
+ // Set timer to drain after KFREE_DRAIN_JIFFIES. | |
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) | |
+- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); | |
++ schedule_delayed_monitor_work(krcp); | |
+ | |
+ unlock_return: | |
+ krc_this_cpu_unlock(krcp, flags); | |
+@@ -3371,7 +3390,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) | |
+ atomic_set(&krcp->backoff_page_cache_fill, 1); | |
+ } | |
+ | |
+- return count; | |
++ return count == 0 ? SHRINK_EMPTY : count; | |
+ } | |
+ | |
+ static unsigned long | |
+@@ -3414,49 +3433,27 @@ void __init kfree_rcu_scheduler_running(void) | |
+ | |
+ raw_spin_lock_irqsave(&krcp->lock, flags); | |
+ if (need_offload_krc(krcp)) | |
+- schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); | |
++ schedule_delayed_monitor_work(krcp); | |
+ raw_spin_unlock_irqrestore(&krcp->lock, flags); | |
+ } | |
+ } | |
+ | |
+ /* | |
+ * During early boot, any blocking grace-period wait automatically | |
+- * implies a grace period. Later on, this is never the case for PREEMPTION. | |
++ * implies a grace period. | |
+ * | |
+- * However, because a context switch is a grace period for !PREEMPTION, any | |
+- * blocking grace-period wait automatically implies a grace period if | |
+- * there is only one CPU online at any point time during execution of | |
+- * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to | |
+- * occasionally incorrectly indicate that there are multiple CPUs online | |
+- * when there was in fact only one the whole time, as this just adds some | |
+- * overhead: RCU still operates correctly. | |
++ * Later on, this could in theory be the case for kernels built with | |
++ * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this | |
++ * is not a common case. Furthermore, this optimization would cause | |
++ * the rcu_gp_oldstate structure to expand by 50%, so this potential | |
++ * grace-period optimization is ignored once the scheduler is running. | |
+ */ | |
+ static int rcu_blocking_is_gp(void) | |
+ { | |
+- int ret; | |
+- | |
+- // Invoking preempt_model_*() too early gets a splat. | |
+- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE || | |
+- preempt_model_full() || preempt_model_rt()) | |
+- return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; | |
++ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) | |
++ return false; | |
+ might_sleep(); /* Check for RCU read-side critical section. */ | |
+- preempt_disable(); | |
+- /* | |
+- * If the rcu_state.n_online_cpus counter is equal to one, | |
+- * there is only one CPU, and that CPU sees all prior accesses | |
+- * made by any CPU that was online at the time of its access. | |
+- * Furthermore, if this counter is equal to one, its value cannot | |
+- * change until after the preempt_enable() below. | |
+- * | |
+- * Furthermore, if rcu_state.n_online_cpus is equal to one here, | |
+- * all later CPUs (both this one and any that come online later | |
+- * on) are guaranteed to see all accesses prior to this point | |
+- * in the code, without the need for additional memory barriers. | |
+- * Those memory barriers are provided by CPU-hotplug code. | |
+- */ | |
+- ret = READ_ONCE(rcu_state.n_online_cpus) <= 1; | |
+- preempt_enable(); | |
+- return ret; | |
++ return true; | |
+ } | |
+ | |
+ /** | |
+@@ -3499,29 +3496,58 @@ static int rcu_blocking_is_gp(void) | |
+ */ | |
+ void synchronize_rcu(void) | |
+ { | |
++ unsigned long flags; | |
++ struct rcu_node *rnp; | |
++ | |
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || | |
+ lock_is_held(&rcu_lock_map) || | |
+ lock_is_held(&rcu_sched_lock_map), | |
+ "Illegal synchronize_rcu() in RCU read-side critical section"); | |
+- if (rcu_blocking_is_gp()) { | |
+- // Note well that this code runs with !PREEMPT && !SMP. | |
+- // In addition, all code that advances grace periods runs at | |
+- // process level. Therefore, this normal GP overlaps with | |
+- // other normal GPs only by being fully nested within them, | |
+- // which allows reuse of ->gp_seq_polled_snap. | |
+- rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); | |
+- rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); | |
+- if (rcu_init_invoked()) | |
+- cond_resched_tasks_rcu_qs(); | |
+- return; // Context allows vacuous grace periods. | |
++ if (!rcu_blocking_is_gp()) { | |
++ if (rcu_gp_is_expedited()) | |
++ synchronize_rcu_expedited(); | |
++ else | |
++ wait_rcu_gp(call_rcu); | |
++ return; | |
+ } | |
+- if (rcu_gp_is_expedited()) | |
+- synchronize_rcu_expedited(); | |
+- else | |
+- wait_rcu_gp(call_rcu); | |
++ | |
++ // Context allows vacuous grace periods. | |
++ // Note well that this code runs with !PREEMPT && !SMP. | |
++ // In addition, all code that advances grace periods runs at | |
++ // process level. Therefore, this normal GP overlaps with other | |
++ // normal GPs only by being fully nested within them, which allows | |
++ // reuse of ->gp_seq_polled_snap. | |
++ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); | |
++ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); | |
++ | |
++ // Update the normal grace-period counters to record | |
++ // this grace period, but only those used by the boot CPU. | |
++ // The rcu_scheduler_starting() will take care of the rest of | |
++ // these counters. | |
++ local_irq_save(flags); | |
++ WARN_ON_ONCE(num_online_cpus() > 1); | |
++ rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT); | |
++ for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent) | |
++ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq; | |
++ local_irq_restore(flags); | |
+ } | |
+ EXPORT_SYMBOL_GPL(synchronize_rcu); | |
+ | |
++/** | |
++ * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie | |
++ * @rgosp: Place to put state cookie | |
++ * | |
++ * Stores into @rgosp a value that will always be treated by functions | |
++ * like poll_state_synchronize_rcu_full() as a cookie whose grace period | |
++ * has already completed. | |
++ */ | |
++void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED; | |
++ rgosp->rgos_exp = RCU_GET_STATE_COMPLETED; | |
++} | |
++EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full); | |
++ | |
+ /** | |
+ * get_state_synchronize_rcu - Snapshot current RCU state | |
+ * | |
+@@ -3541,21 +3567,42 @@ unsigned long get_state_synchronize_rcu(void) | |
+ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); | |
+ | |
+ /** | |
+- * start_poll_synchronize_rcu - Snapshot and start RCU grace period | |
++ * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited | |
++ * @rgosp: location to place combined normal/expedited grace-period state | |
+ * | |
+- * Returns a cookie that is used by a later call to cond_synchronize_rcu() | |
+- * or poll_state_synchronize_rcu() to determine whether or not a full | |
+- * grace period has elapsed in the meantime. If the needed grace period | |
+- * is not already slated to start, notifies RCU core of the need for that | |
+- * grace period. | |
++ * Places the normal and expedited grace-period states in @rgosp. This | |
++ * state value can be passed to a later call to cond_synchronize_rcu_full() | |
++ * or poll_state_synchronize_rcu_full() to determine whether or not a | |
++ * grace period (whether normal or expedited) has elapsed in the meantime. | |
++ * The rcu_gp_oldstate structure takes up twice the memory of an unsigned | |
++ * long, but is guaranteed to see all grace periods. In contrast, the | |
++ * combined state occupies less memory, but can sometimes fail to take | |
++ * grace periods into account. | |
+ * | |
+- * Interrupts must be enabled for the case where it is necessary to awaken | |
+- * the grace-period kthread. | |
++ * This does not guarantee that the needed grace period will actually | |
++ * start. | |
+ */ | |
+-unsigned long start_poll_synchronize_rcu(void) | |
++void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ struct rcu_node *rnp = rcu_get_root(); | |
++ | |
++ /* | |
++ * Any prior manipulation of RCU-protected data must happen | |
++ * before the loads from ->gp_seq and ->expedited_sequence. | |
++ */ | |
++ smp_mb(); /* ^^^ */ | |
++ rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); | |
++ rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); | |
++} | |
++EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); | |
++ | |
++/* | |
++ * Helper function for start_poll_synchronize_rcu() and | |
++ * start_poll_synchronize_rcu_full(). | |
++ */ | |
++static void start_poll_synchronize_rcu_common(void) | |
+ { | |
+ unsigned long flags; | |
+- unsigned long gp_seq = get_state_synchronize_rcu(); | |
+ bool needwake; | |
+ struct rcu_data *rdp; | |
+ struct rcu_node *rnp; | |
+@@ -3575,17 +3622,57 @@ unsigned long start_poll_synchronize_rcu(void) | |
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | |
+ if (needwake) | |
+ rcu_gp_kthread_wake(); | |
++} | |
++ | |
++/** | |
++ * start_poll_synchronize_rcu - Snapshot and start RCU grace period | |
++ * | |
++ * Returns a cookie that is used by a later call to cond_synchronize_rcu() | |
++ * or poll_state_synchronize_rcu() to determine whether or not a full | |
++ * grace period has elapsed in the meantime. If the needed grace period | |
++ * is not already slated to start, notifies RCU core of the need for that | |
++ * grace period. | |
++ * | |
++ * Interrupts must be enabled for the case where it is necessary to awaken | |
++ * the grace-period kthread. | |
++ */ | |
++unsigned long start_poll_synchronize_rcu(void) | |
++{ | |
++ unsigned long gp_seq = get_state_synchronize_rcu(); | |
++ | |
++ start_poll_synchronize_rcu_common(); | |
+ return gp_seq; | |
+ } | |
+ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); | |
+ | |
+ /** | |
+- * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period | |
++ * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period | |
++ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() | |
+ * | |
++ * Places the normal and expedited grace-period states in *@rgosp. This | |
++ * state value can be passed to a later call to cond_synchronize_rcu_full() | |
++ * or poll_state_synchronize_rcu_full() to determine whether or not a | |
++ * grace period (whether normal or expedited) has elapsed in the meantime. | |
++ * If the needed grace period is not already slated to start, notifies | |
++ * RCU core of the need for that grace period. | |
++ * | |
++ * Interrupts must be enabled for the case where it is necessary to awaken | |
++ * the grace-period kthread. | |
++ */ | |
++void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ get_state_synchronize_rcu_full(rgosp); | |
++ | |
++ start_poll_synchronize_rcu_common(); | |
++} | |
++EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full); | |
++ | |
++/** | |
++ * poll_state_synchronize_rcu - Has the specified RCU grace period completed? | |
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() | |
+ * | |
+ * If a full RCU grace period has elapsed since the earlier call from | |
+- * which oldstate was obtained, return @true, otherwise return @false. | |
++ * which @oldstate was obtained, return @true, otherwise return @false. | |
+ * If @false is returned, it is the caller's responsibility to invoke this | |
+ * function later on until it does return @true. Alternatively, the caller | |
+ * can explicitly wait for a grace period, for example, by passing @oldstate | |
+@@ -3594,10 +3681,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); | |
+ * Yes, this function does not take counter wrap into account. | |
+ * But counter wrap is harmless. If the counter wraps, we have waited for | |
+ * more than a billion grace periods (and way more on a 64-bit system!). | |
+- * Those needing to keep oldstate values for very long time periods | |
+- * (many hours even on 32-bit systems) should check them occasionally | |
+- * and either refresh them or set a flag indicating that the grace period | |
+- * has completed. | |
++ * Those needing to keep old state values for very long time periods | |
++ * (many hours even on 32-bit systems) should check them occasionally and | |
++ * either refresh them or set a flag indicating that the grace period has | |
++ * completed. Alternatively, they can use get_completed_synchronize_rcu() | |
++ * to get a guaranteed-completed grace-period state. | |
+ * | |
+ * This function provides the same memory-ordering guarantees that | |
+ * would be provided by a synchronize_rcu() that was invoked at the call | |
+@@ -3616,8 +3704,56 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) | |
+ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); | |
+ | |
+ /** | |
+- * cond_synchronize_rcu - Conditionally wait for an RCU grace period | |
++ * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed? | |
++ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() | |
+ * | |
++ * If a full RCU grace period has elapsed since the earlier call from | |
++ * which *rgosp was obtained, return @true, otherwise return @false. | |
++ * If @false is returned, it is the caller's responsibility to invoke this | |
++ * function later on until it does return @true. Alternatively, the caller | |
++ * can explicitly wait for a grace period, for example, by passing @rgosp | |
++ * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). | |
++ * | |
++ * Yes, this function does not take counter wrap into account. | |
++ * But counter wrap is harmless. If the counter wraps, we have waited | |
++ * for more than a billion grace periods (and way more on a 64-bit | |
++ * system!). Those needing to keep rcu_gp_oldstate values for very | |
++ * long time periods (many hours even on 32-bit systems) should check | |
++ * them occasionally and either refresh them or set a flag indicating | |
++ * that the grace period has completed. Alternatively, they can use | |
++ * get_completed_synchronize_rcu_full() to get a guaranteed-completed | |
++ * grace-period state. | |
++ * | |
++ * This function provides the same memory-ordering guarantees that would | |
++ * be provided by a synchronize_rcu() that was invoked at the call to | |
++ * the function that provided @rgosp, and that returned at the end of this | |
++ * function. And this guarantee requires that the root rcu_node structure's | |
++ * ->gp_seq field be checked instead of that of the rcu_state structure. | |
++ * The problem is that the just-ending grace-period's callbacks can be | |
++ * invoked between the time that the root rcu_node structure's ->gp_seq | |
++ * field is updated and the time that the rcu_state structure's ->gp_seq | |
++ * field is updated. Therefore, if a single synchronize_rcu() is to | |
++ * cause a subsequent poll_state_synchronize_rcu_full() to return @true, | |
++ * then the root rcu_node structure is the one that needs to be polled. | |
++ */ | |
++bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ struct rcu_node *rnp = rcu_get_root(); | |
++ | |
++ smp_mb(); // Order against root rcu_node structure grace-period cleanup. | |
++ if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED || | |
++ rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) || | |
++ rgosp->rgos_exp == RCU_GET_STATE_COMPLETED || | |
++ rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) { | |
++ smp_mb(); /* Ensure GP ends before subsequent accesses. */ | |
++ return true; | |
++ } | |
++ return false; | |
++} | |
++EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full); | |
++ | |
++/** | |
++ * cond_synchronize_rcu - Conditionally wait for an RCU grace period | |
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() | |
+ * | |
+ * If a full RCU grace period has elapsed since the earlier call to | |
+@@ -3641,6 +3777,33 @@ void cond_synchronize_rcu(unsigned long oldstate) | |
+ } | |
+ EXPORT_SYMBOL_GPL(cond_synchronize_rcu); | |
+ | |
++/** | |
++ * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period | |
++ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() | |
++ * | |
++ * If a full RCU grace period has elapsed since the call to | |
++ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), | |
++ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was | |
++ * obtained, just return. Otherwise, invoke synchronize_rcu() to wait | |
++ * for a full grace period. | |
++ * | |
++ * Yes, this function does not take counter wrap into account. | |
++ * But counter wrap is harmless. If the counter wraps, we have waited for | |
++ * more than 2 billion grace periods (and way more on a 64-bit system!), | |
++ * so waiting for a couple of additional grace periods should be just fine. | |
++ * | |
++ * This function provides the same memory-ordering guarantees that | |
++ * would be provided by a synchronize_rcu() that was invoked at the call | |
++ * to the function that provided @rgosp and that returned at the end of | |
++ * this function. | |
++ */ | |
++void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ if (!poll_state_synchronize_rcu_full(rgosp)) | |
++ synchronize_rcu(); | |
++} | |
++EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full); | |
++ | |
+ /* | |
+ * Check to see if there is any immediate RCU-related work to be done by | |
+ * the current CPU, returning 1 if so and zero otherwise. The checks are | |
+@@ -4312,9 +4475,20 @@ early_initcall(rcu_spawn_gp_kthread); | |
+ */ | |
+ void rcu_scheduler_starting(void) | |
+ { | |
++ unsigned long flags; | |
++ struct rcu_node *rnp; | |
++ | |
+ WARN_ON(num_online_cpus() != 1); | |
+ WARN_ON(nr_context_switches() > 0); | |
+ rcu_test_sync_prims(); | |
++ | |
++ // Fix up the ->gp_seq counters. | |
++ local_irq_save(flags); | |
++ rcu_for_each_node_breadth_first(rnp) | |
++ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq; | |
++ local_irq_restore(flags); | |
++ | |
++ // Switch out of early boot mode. | |
+ rcu_scheduler_active = RCU_SCHEDULER_INIT; | |
+ rcu_test_sync_prims(); | |
+ } | |
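A minimal sketch (not part of the patch) of the conditional-wait pattern described in the cond_synchronize_rcu_full() kernel-doc above: snapshot the grace-period state when an object is unlinked, then wait at free time only if no full grace period has elapsed since. struct foo and its helpers are hypothetical.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	struct rcu_gp_oldstate unlink_gp;
	/* ... payload ... */
};

static void foo_unlink(struct foo *p)
{
	/* Remove p from the RCU-protected structure first, then snapshot. */
	get_state_synchronize_rcu_full(&p->unlink_gp);
}

static void foo_free(struct foo *p)
{
	/* Blocks only if the snapshot's grace period has not yet completed. */
	cond_synchronize_rcu_full(&p->unlink_gp);
	kfree(p);
}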
+diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h | |
+index be667583a554..18e9b4cd78ef 100644 | |
+--- a/kernel/rcu/tree_exp.h | |
++++ b/kernel/rcu/tree_exp.h | |
+@@ -828,11 +828,13 @@ static void rcu_exp_handler(void *unused) | |
+ { | |
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data); | |
+ struct rcu_node *rnp = rdp->mynode; | |
++ bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); | |
+ | |
+ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || | |
+ __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) | |
+ return; | |
+- if (rcu_is_cpu_rrupt_from_idle()) { | |
++ if (rcu_is_cpu_rrupt_from_idle() || | |
++ (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) { | |
+ rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); | |
+ return; | |
+ } | |
+@@ -906,6 +908,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) | |
+ void synchronize_rcu_expedited(void) | |
+ { | |
+ bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); | |
++ unsigned long flags; | |
+ struct rcu_exp_work rew; | |
+ struct rcu_node *rnp; | |
+ unsigned long s; | |
+@@ -924,8 +927,11 @@ void synchronize_rcu_expedited(void) | |
+ // them, which allows reuse of ->gp_seq_polled_exp_snap. | |
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); | |
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); | |
+- if (rcu_init_invoked()) | |
+- cond_resched(); | |
++ | |
++ local_irq_save(flags); | |
++ WARN_ON_ONCE(num_online_cpus() > 1); | |
++ rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT); | |
++ local_irq_restore(flags); | |
+ return; // Context allows vacuous grace periods. | |
+ } | |
+ | |
+@@ -1027,6 +1033,24 @@ unsigned long start_poll_synchronize_rcu_expedited(void) | |
+ } | |
+ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited); | |
+ | |
++/** | |
++ * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period | |
++ * @rgosp: Place to put snapshot of grace-period state | |
++ * | |
++ * Places the normal and expedited grace-period states in rgosp. This | |
++ * state value can be passed to a later call to cond_synchronize_rcu_full() | |
++ * or poll_state_synchronize_rcu_full() to determine whether or not a | |
++ * grace period (whether normal or expedited) has elapsed in the meantime. | |
++ * If the needed expedited grace period is not already slated to start, | |
++ * initiates that grace period. | |
++ */ | |
++void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ get_state_synchronize_rcu_full(rgosp); | |
++ (void)start_poll_synchronize_rcu_expedited(); | |
++} | |
++EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full); | |
++ | |
+ /** | |
+ * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period | |
+ * | |
+@@ -1053,3 +1077,30 @@ void cond_synchronize_rcu_expedited(unsigned long oldstate) | |
+ synchronize_rcu_expedited(); | |
+ } | |
+ EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited); | |
++ | |
++/** | |
++ * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period | |
++ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() | |
++ * | |
++ * If a full RCU grace period has elapsed since the call to | |
++ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), | |
++ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was | |
++ * obtained, just return. Otherwise, invoke synchronize_rcu_expedited() | |
++ * to wait for a full grace period. | |
++ * | |
++ * Yes, this function does not take counter wrap into account. | |
++ * But counter wrap is harmless. If the counter wraps, we have waited for | |
++ * more than 2 billion grace periods (and way more on a 64-bit system!), | |
++ * so waiting for a couple of additional grace periods should be just fine. | |
++ * | |
++ * This function provides the same memory-ordering guarantees that | |
++ * would be provided by a synchronize_rcu() that was invoked at the call | |
++ * to the function that provided @rgosp and that returned at the end of | |
++ * this function. | |
++ */ | |
++void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) | |
++{ | |
++ if (!poll_state_synchronize_rcu_full(rgosp)) | |
++ synchronize_rcu_expedited(); | |
++} | |
++EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full); | |
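The expedited variants added to tree_exp.h follow the same pattern, with start_poll_synchronize_rcu_expedited_full() both recording the state and kicking off an expedited grace period if one is not already slated to start. A hedged sketch of the intended call sequence, again with invented names rather than anything from the patch:

#include <linux/rcupdate.h>

/* Hypothetical snapshot covering the configuration most recently retired. */
static struct rcu_gp_oldstate my_cfg_gp;

/* Publish a new config and start an expedited grace period for the old one. */
static void my_cfg_publish(void)
{
	/* ... rcu_assign_pointer() the new configuration into place ... */
	start_poll_synchronize_rcu_expedited_full(&my_cfg_gp);
}

/* Free the old config once readers are guaranteed to have moved on. */
static void my_cfg_teardown_old(void)
{
	/*
	 * Returns immediately if a full grace period (normal or expedited)
	 * has elapsed since my_cfg_publish(); otherwise falls back to
	 * synchronize_rcu_expedited().
	 */
	cond_synchronize_rcu_expedited_full(&my_cfg_gp);
	/* ... kfree() the old configuration ... */
}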
+diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h | |
+index a8f574d8850d..0a5f0ef41484 100644 | |
+--- a/kernel/rcu/tree_nocb.h | |
++++ b/kernel/rcu/tree_nocb.h | |
+@@ -1111,7 +1111,7 @@ int rcu_nocb_cpu_deoffload(int cpu) | |
+ if (!ret) | |
+ cpumask_clear_cpu(cpu, rcu_nocb_mask); | |
+ } else { | |
+- pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); | |
++ pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu); | |
+ ret = -EINVAL; | |
+ } | |
+ } | |
+@@ -1196,7 +1196,7 @@ int rcu_nocb_cpu_offload(int cpu) | |
+ if (!ret) | |
+ cpumask_set_cpu(cpu, rcu_nocb_mask); | |
+ } else { | |
+- pr_info("NOCB: Can't CB-offload an offline CPU\n"); | |
++ pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu); | |
+ ret = -EINVAL; | |
+ } | |
+ } | |
+@@ -1452,8 +1452,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) | |
+ (long)rdp->nocb_gp_seq, | |
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), | |
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', | |
+- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, | |
+- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); | |
++ rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, | |
++ show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread)); | |
+ } | |
+ | |
+ /* Dump out nocb kthread state for the specified rcu_data structure. */ | |
+@@ -1497,7 +1497,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) | |
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], | |
+ rcu_segcblist_n_cbs(&rdp->cblist), | |
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', | |
+- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, | |
++ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, | |
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); | |
+ | |
+ /* It is OK for GP kthreads to have GP state. */ | |
+diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h | |
+index 438ecae6bd7e..e3142ee35fc6 100644 | |
+--- a/kernel/rcu/tree_plugin.h | |
++++ b/kernel/rcu/tree_plugin.h | |
+@@ -641,7 +641,8 @@ static void rcu_read_unlock_special(struct task_struct *t) | |
+ | |
+ expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || | |
+ (rdp->grpmask & READ_ONCE(rnp->expmask)) || | |
+- IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || | |
++ (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && | |
++ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) || | |
+ (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && | |
+ t->rcu_blocked_node); | |
+ // Need to defer quiescent state until everything is enabled. | |
+@@ -718,9 +719,6 @@ static void rcu_flavor_sched_clock_irq(int user) | |
+ struct task_struct *t = current; | |
+ | |
+ lockdep_assert_irqs_disabled(); | |
+- if (user || rcu_is_cpu_rrupt_from_idle()) { | |
+- rcu_note_voluntary_context_switch(current); | |
+- } | |
+ if (rcu_preempt_depth() > 0 || | |
+ (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { | |
+ /* No QS, force context switch if deferred. */ | |
+@@ -824,6 +822,7 @@ void rcu_read_unlock_strict(void) | |
+ if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) | |
+ return; | |
+ rdp = this_cpu_ptr(&rcu_data); | |
++ rdp->cpu_no_qs.b.norm = false; | |
+ rcu_report_qs_rdp(rdp); | |
+ udelay(rcu_unlock_delay); | |
+ } | |
+@@ -869,7 +868,7 @@ void rcu_all_qs(void) | |
+ | |
+ if (!raw_cpu_read(rcu_data.rcu_urgent_qs)) | |
+ return; | |
+- preempt_disable(); | |
++ preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels | |
+ /* Load rcu_urgent_qs before other flags. */ | |
+ if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { | |
+ preempt_enable(); | |
+@@ -931,10 +930,13 @@ static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) | |
+ return false; | |
+ } | |
+ | |
+-// Except that we do need to respond to a request by an expedited grace | |
+-// period for a quiescent state from this CPU. Note that requests from | |
+-// tasks are handled when removing the task from the blocked-tasks list | |
+-// below. | |
++// Except that we do need to respond to a request by an expedited | |
++// grace period for a quiescent state from this CPU. Note that in | |
++// non-preemptible kernels, there can be no context switches within RCU | |
++// read-side critical sections, which in turn means that the leaf rcu_node | |
++// structure's blocked-tasks list is always empty. There is therefore no need to | |
++// actually check it. Instead, a quiescent state from this CPU suffices, | |
++// and this function is only called from such a quiescent state. | |
+ notrace void rcu_preempt_deferred_qs(struct task_struct *t) | |
+ { | |
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data); | |
+@@ -972,7 +974,6 @@ static void rcu_flavor_sched_clock_irq(int user) | |
+ * neither access nor modify, at least not while the | |
+ * corresponding CPU is online. | |
+ */ | |
+- | |
+ rcu_qs(); | |
+ } | |
+ } | |
+@@ -1238,8 +1239,11 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |
+ cpu != outgoingcpu) | |
+ cpumask_set_cpu(cpu, cm); | |
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); | |
+- if (cpumask_empty(cm)) | |
++ if (cpumask_empty(cm)) { | |
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); | |
++ if (outgoingcpu >= 0) | |
++ cpumask_clear_cpu(outgoingcpu, cm); | |
++ } | |
+ set_cpus_allowed_ptr(t, cm); | |
+ mutex_unlock(&rnp->boost_kthread_mutex); | |
+ free_cpumask_var(cm); | |
+diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h | |
+index c3fbbcc09327..5653560573e2 100644 | |
+--- a/kernel/rcu/tree_stall.h | |
++++ b/kernel/rcu/tree_stall.h | |
+@@ -368,7 +368,7 @@ static void rcu_dump_cpu_stacks(void) | |
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { | |
+ if (cpu_is_offline(cpu)) | |
+ pr_err("Offline CPU %d blocking current GP.\n", cpu); | |
+- else if (!trigger_single_cpu_backtrace(cpu)) | |
++ else | |
+ dump_cpu_task(cpu); | |
+ } | |
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | |
+@@ -511,8 +511,7 @@ static void rcu_check_gp_kthread_starvation(void) | |
+ pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); | |
+ } else { | |
+ pr_err("Stack dump where RCU GP kthread last ran:\n"); | |
+- if (!trigger_single_cpu_backtrace(cpu)) | |
+- dump_cpu_task(cpu); | |
++ dump_cpu_task(cpu); | |
+ } | |
+ } | |
+ wake_up_process(gpk); | |
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c | |
+index c808fe78f207..eb804dbfed0d 100644 | |
+--- a/kernel/sched/core.c | |
++++ b/kernel/sched/core.c | |
+@@ -74,6 +74,7 @@ | |
+ | |
+ #include <uapi/linux/sched/types.h> | |
+ | |
++#include <asm/irq_regs.h> | |
+ #include <asm/switch_to.h> | |
+ #include <asm/tlb.h> | |
+ | |
+@@ -11204,6 +11205,19 @@ struct cgroup_subsys cpu_cgrp_subsys = { | |
+ | |
+ void dump_cpu_task(int cpu) | |
+ { | |
++ if (cpu == smp_processor_id() && in_hardirq()) { | |
++ struct pt_regs *regs; | |
++ | |
++ regs = get_irq_regs(); | |
++ if (regs) { | |
++ show_regs(regs); | |
++ return; | |
++ } | |
++ } | |
++ | |
++ if (trigger_single_cpu_backtrace(cpu)) | |
++ return; | |
++ | |
+ pr_info("Task dump for CPU %d:\n", cpu); | |
+ sched_show_task(cpu_curr(cpu)); | |
+ } | |
+diff --git a/kernel/smp.c b/kernel/smp.c | |
+index 661d09ae5d6a..06a413987a14 100644 | |
+--- a/kernel/smp.c | |
++++ b/kernel/smp.c | |
+@@ -370,8 +370,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * | |
+ if (cpu >= 0) { | |
+ if (static_branch_unlikely(&csdlock_debug_extended)) | |
+ csd_lock_print_extended(csd, cpu); | |
+- if (!trigger_single_cpu_backtrace(cpu)) | |
+- dump_cpu_task(cpu); | |
++ dump_cpu_task(cpu); | |
+ if (!cpu_cur_csd) { | |
+ pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); | |
+ arch_send_call_function_single_ipi(cpu); | |
+-- | |
+2.38.0.rc2 | |
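The tree_stall.h, sched/core.c, and smp.c hunks at the end all stem from one refactor: dump_cpu_task() now tries show_regs() when dumping the current CPU from hardirq context, then an NMI backtrace via trigger_single_cpu_backtrace(), and only then falls back to sched_show_task(), so callers no longer open-code that fallback. A small before/after sketch of a diagnostic caller (the function names here are hypothetical):

#include <linux/nmi.h>		/* trigger_single_cpu_backtrace() */
#include <linux/sched/debug.h>	/* dump_cpu_task() */

/* Before the patch: each caller had to provide the fallback itself. */
static void my_report_stalled_cpu_old(int cpu)
{
	if (!trigger_single_cpu_backtrace(cpu))
		dump_cpu_task(cpu);
}

/* After the patch: dump_cpu_task() owns the whole fallback chain. */
static void my_report_stalled_cpu(int cpu)
{
	dump_cpu_task(cpu);
}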