@kiryl
Created October 8, 2025 14:07
diff --git a/fs/inode.c b/fs/inode.c
index ec9339024ac3..52163d28d630 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+ seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
+ &mapping->i_pages.xa_lock);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->i_private_list);
spin_lock_init(&mapping->i_private_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9e9d7c757efe..a900214f0f3a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -522,6 +522,7 @@ struct address_space {
struct list_head i_private_list;
struct rw_semaphore i_mmap_rwsem;
void * i_private_data;
+ seqcount_spinlock_t i_pages_delete_seqcnt;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
/*
* On most architectures that alignment is already the case; but
diff --git a/mm/filemap.c b/mm/filemap.c
index 751838ef05e5..6b9627cf47af 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping,
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ write_seqcount_begin(&mapping->i_pages_delete_seqcnt);
xas_store(&xas, shadow);
xas_init_marks(&xas);
+ write_seqcount_end(&mapping->i_pages_delete_seqcnt);
folio->mapping = NULL;
/* Leave folio->index set: truncation lookup relies upon it */
@@ -2659,6 +2661,59 @@ static void filemap_end_dropbehind_read(struct folio *folio)
}
}
+static inline unsigned long filemap_fast_read(struct address_space *mapping,
+ loff_t pos, char *buffer,
+ size_t size)
+{
+ XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
+ struct folio *folio;
+ loff_t file_size;
+ unsigned int seq;
+
+ lockdep_assert_in_rcu_read_lock();
+
+ /* Give up and go to slow path if raced with page_cache_delete() */
+ if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
+ return 0;
+
+ xas_reset(&xas);
+ folio = xas_load(&xas);
+ if (xas_retry(&xas, folio))
+ return 0;
+
+ if (!folio || xa_is_value(folio))
+ return 0;
+
+ if (!folio_test_uptodate(folio))
+ return 0;
+
+ /* No fast-case if we are supposed to start readahead */
+ if (folio_test_readahead(folio))
+ return 0;
+ /* .. or mark it accessed */
+ if (!folio_test_referenced(folio))
+ return 0;
+
+ /* i_size check must be after folio_test_uptodate() */
+ file_size = i_size_read(mapping->host);
+ if (unlikely(pos >= file_size))
+ return 0;
+ if (size > file_size - pos)
+ size = file_size - pos;
+
+ /* Do the data copy */
+ if (memcpy_from_file_folio(buffer, folio, pos, size) != size) {
+ /* No partial reads */
+ return 0;
+ }
+
+ /* Give up and go to slow path if raced with page_cache_delete() */
+ if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
+ return 0;
+
+ return size;
+}
+
/**
* filemap_read - Read data from the page cache.
* @iocb: The iocb to read.
@@ -2679,7 +2734,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
- struct folio_batch fbatch;
+ union {
+ struct folio_batch fbatch;
+ __DECLARE_FLEX_ARRAY(char, buffer);
+ } area __uninitialized;
int i, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
@@ -2693,7 +2751,53 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
- folio_batch_init(&fbatch);
+
+ /* Don't bother with flush_dcache_folio() */
+ if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
+ goto slowpath;
+
+ /*
+ * Try a quick lockless read into the 'area' union. Note that
+ * this union is intentionally marked "__uninitialized", because
+ * any compiler initialization would be pointless since this
+ * path can fill it with garbage anyway.
+ */
+ do {
+ size_t to_read, read, copied;
+
+ to_read = min(iov_iter_count(iter), sizeof(area));
+
+ /* Let's see if we can just do the read under RCU */
+ rcu_read_lock();
+ read = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, to_read);
+ rcu_read_unlock();
+
+ if (!read)
+ break;
+
+ copied = copy_to_iter(area.buffer, read, iter);
+
+ already_read += copied;
+ iocb->ki_pos += copied;
+ last_pos = iocb->ki_pos;
+
+ if (copied < read) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ /* filemap_fast_read() only reads short at EOF: Stop. */
+ if (read != to_read)
+ goto out;
+ } while (iov_iter_count(iter));
+
+ if (!iov_iter_count(iter))
+ goto out;
+slowpath:
+ /*
+ * Properly initialize the fbatch for the slow path.
+ */
+ folio_batch_init(&area.fbatch);
do {
cond_resched();
@@ -2709,7 +2813,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
if (unlikely(iocb->ki_pos >= i_size_read(inode)))
break;
- error = filemap_get_pages(iocb, iter->count, &fbatch, false);
+ error = filemap_get_pages(iocb, iter->count, &area.fbatch, false);
if (error < 0)
break;
@@ -2737,11 +2841,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
* mark it as accessed the first time.
*/
if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
- fbatch.folios[0]))
- folio_mark_accessed(fbatch.folios[0]);
+ area.fbatch.folios[0]))
+ folio_mark_accessed(area.fbatch.folios[0]);
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
+ for (i = 0; i < folio_batch_count(&area.fbatch); i++) {
+ struct folio *folio = area.fbatch.folios[i];
size_t fsize = folio_size(folio);
size_t offset = iocb->ki_pos & (fsize - 1);
size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
@@ -2772,15 +2876,15 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
}
}
put_folios:
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
+ for (i = 0; i < folio_batch_count(&area.fbatch); i++) {
+ struct folio *folio = area.fbatch.folios[i];
filemap_end_dropbehind_read(folio);
folio_put(folio);
}
- folio_batch_init(&fbatch);
+ folio_batch_init(&area.fbatch);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
-
+out:
file_accessed(filp);
ra->prev_pos = last_pos;
return already_read ? already_read : error;
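
For context, here is a minimal stand-alone sketch (not part of the patch) of the seqcount pattern it relies on: page_cache_delete() brackets the xarray store with write_seqcount_begin()/write_seqcount_end() while holding xa_lock, and filemap_fast_read() samples the counter with raw_seqcount_try_begin() before touching the folio, then rechecks it with read_seqcount_retry() after the copy, falling back to the locked slow path on any mismatch. The demo_* names below are hypothetical, made up purely for illustration.

#include <linux/seqlock.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);
static seqcount_spinlock_t demo_seqcnt =
	SEQCNT_SPINLOCK_ZERO(demo_seqcnt, &demo_lock);
static unsigned long demo_value;

/* Writer side: runs under the spinlock, like page_cache_delete() under xa_lock */
static void demo_update(unsigned long new)
{
	spin_lock(&demo_lock);
	write_seqcount_begin(&demo_seqcnt);
	demo_value = new;
	write_seqcount_end(&demo_seqcnt);
	spin_unlock(&demo_lock);
}

/* Reader side: lockless; returns false if it raced with a writer */
static bool demo_read(unsigned long *out)
{
	unsigned int seq;

	/* Bail out immediately if a writer is in progress (odd sequence) */
	if (!raw_seqcount_try_begin(&demo_seqcnt, seq))
		return false;

	*out = READ_ONCE(demo_value);

	/* Discard the result if a writer ran while we were reading */
	return !read_seqcount_retry(&demo_seqcnt, seq);
}

The writer never waits for readers; a reader that loses the race simply gives up, which is why filemap_read() treats a 0 return from filemap_fast_read() as "take the normal fbatch slow path" rather than as an error.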