Created
October 8, 2025 14:07
-
-
Save kiryl/420bfcf909805ba56a5b3f6f78ea8c15 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| diff --git a/fs/inode.c b/fs/inode.c | |
| index ec9339024ac3..52163d28d630 100644 | |
| --- a/fs/inode.c | |
| +++ b/fs/inode.c | |
| @@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink); | |
| static void __address_space_init_once(struct address_space *mapping) | |
| { | |
| xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); | |
| + seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt, | |
| + &mapping->i_pages.xa_lock); | |
| init_rwsem(&mapping->i_mmap_rwsem); | |
| INIT_LIST_HEAD(&mapping->i_private_list); | |
| spin_lock_init(&mapping->i_private_lock); | |
| diff --git a/include/linux/fs.h b/include/linux/fs.h | |
| index 9e9d7c757efe..a900214f0f3a 100644 | |
| --- a/include/linux/fs.h | |
| +++ b/include/linux/fs.h | |
| @@ -522,6 +522,7 @@ struct address_space { | |
| struct list_head i_private_list; | |
| struct rw_semaphore i_mmap_rwsem; | |
| void * i_private_data; | |
| + seqcount_spinlock_t i_pages_delete_seqcnt; | |
| } __attribute__((aligned(sizeof(long)))) __randomize_layout; | |
| /* | |
| * On most architectures that alignment is already the case; but | |
| diff --git a/mm/filemap.c b/mm/filemap.c | |
| index 751838ef05e5..6b9627cf47af 100644 | |
| --- a/mm/filemap.c | |
| +++ b/mm/filemap.c | |
| @@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping, | |
| VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); | |
| + write_seqcount_begin(&mapping->i_pages_delete_seqcnt); | |
| xas_store(&xas, shadow); | |
| xas_init_marks(&xas); | |
| + write_seqcount_end(&mapping->i_pages_delete_seqcnt); | |
| folio->mapping = NULL; | |
| /* Leave folio->index set: truncation lookup relies upon it */ | |
| @@ -2659,6 +2661,59 @@ static void filemap_end_dropbehind_read(struct folio *folio) | |
| } | |
| } | |
| +static inline unsigned long filemap_fast_read(struct address_space *mapping, | |
| + loff_t pos, char *buffer, | |
| + size_t size) | |
| +{ | |
| + XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT); | |
| + struct folio *folio; | |
| + loff_t file_size; | |
| + unsigned int seq; | |
| + | |
| + lockdep_assert_in_rcu_read_lock(); | |
| + | |
| + /* Give up and go to slow path if raced with page_cache_delete() */ | |
| + if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq)) | |
| + return 0; | |
| + | |
| + xas_reset(&xas); | |
| + folio = xas_load(&xas); | |
| + if (xas_retry(&xas, folio)) | |
| + return 0; | |
| + | |
| + if (!folio || xa_is_value(folio)) | |
| + return 0; | |
| + | |
| + if (!folio_test_uptodate(folio)) | |
| + return 0; | |
| + | |
| + /* No fast-case if readahead is supposed to started */ | |
| + if (folio_test_readahead(folio)) | |
| + return 0; | |
| + /* .. or mark it accessed */ | |
| + if (!folio_test_referenced(folio)) | |
| + return 0; | |
| + | |
| + /* i_size check must be after folio_test_uptodate() */ | |
| + file_size = i_size_read(mapping->host); | |
| + if (unlikely(pos >= file_size)) | |
| + return 0; | |
| + if (size > file_size - pos) | |
| + size = file_size - pos; | |
| + | |
| + /* Do the data copy */ | |
| + if (memcpy_from_file_folio(buffer, folio, pos, size) != size) { | |
| + /* No partial reads */ | |
| + return 0; | |
| + } | |
| + | |
| + /* Give up and go to slow path if raced with page_cache_delete() */ | |
| + if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq)) | |
| + return 0; | |
| + | |
| + return size; | |
| +} | |
| + | |
| /** | |
| * filemap_read - Read data from the page cache. | |
| * @iocb: The iocb to read. | |
| @@ -2679,7 +2734,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, | |
| struct file_ra_state *ra = &filp->f_ra; | |
| struct address_space *mapping = filp->f_mapping; | |
| struct inode *inode = mapping->host; | |
| - struct folio_batch fbatch; | |
| + union { | |
| + struct folio_batch fbatch; | |
| + __DECLARE_FLEX_ARRAY(char, buffer); | |
| + } area __uninitialized; | |
| int i, error = 0; | |
| bool writably_mapped; | |
| loff_t isize, end_offset; | |
| @@ -2693,7 +2751,53 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, | |
| return 0; | |
| iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); | |
| - folio_batch_init(&fbatch); | |
| + | |
| + /* Don't bother with flush_dcache_folio() */ | |
| + if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE) | |
| + goto slowpath; | |
| + | |
| + /* | |
| + * Try a quick lockless read into the 'area' union. Note that | |
| + * this union is intentionally marked "__uninitialized", because | |
| + * any compiler initialization would be pointless since this | |
| + * can fill it with garbage. | |
| + */ | |
| + do { | |
| + size_t to_read, read, copied; | |
| + | |
| + to_read = min(iov_iter_count(iter), sizeof(area)); | |
| + | |
| + /* Let's see if we can just do the read under RCU */ | |
| + rcu_read_lock(); | |
| + read = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, to_read); | |
| + rcu_read_unlock(); | |
| + | |
| + if (!read) | |
| + break; | |
| + | |
| + copied = copy_to_iter(area.buffer, read, iter); | |
| + | |
| + already_read += copied; | |
| + iocb->ki_pos += copied; | |
| + last_pos = iocb->ki_pos; | |
| + | |
| + if (copied < read) { | |
| + error = -EFAULT; | |
| + goto out; | |
| + } | |
| + | |
| + /* filemap_fast_read() only reads short at EOF: Stop. */ | |
| + if (read != to_read) | |
| + goto out; | |
| + } while (iov_iter_count(iter)); | |
| + | |
| + if (!iov_iter_count(iter)) | |
| + goto out; | |
| +slowpath: | |
| + /* | |
| + * This actually properly initializes the fbatch for the slow case | |
| + */ | |
| + folio_batch_init(&area.fbatch); | |
| do { | |
| cond_resched(); | |
| @@ -2709,7 +2813,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, | |
| if (unlikely(iocb->ki_pos >= i_size_read(inode))) | |
| break; | |
| - error = filemap_get_pages(iocb, iter->count, &fbatch, false); | |
| + error = filemap_get_pages(iocb, iter->count, &area.fbatch, false); | |
| if (error < 0) | |
| break; | |
| @@ -2737,11 +2841,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, | |
| * mark it as accessed the first time. | |
| */ | |
| if (!pos_same_folio(iocb->ki_pos, last_pos - 1, | |
| - fbatch.folios[0])) | |
| - folio_mark_accessed(fbatch.folios[0]); | |
| + area.fbatch.folios[0])) | |
| + folio_mark_accessed(area.fbatch.folios[0]); | |
| - for (i = 0; i < folio_batch_count(&fbatch); i++) { | |
| - struct folio *folio = fbatch.folios[i]; | |
| + for (i = 0; i < folio_batch_count(&area.fbatch); i++) { | |
| + struct folio *folio = area.fbatch.folios[i]; | |
| size_t fsize = folio_size(folio); | |
| size_t offset = iocb->ki_pos & (fsize - 1); | |
| size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, | |
| @@ -2772,15 +2876,15 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, | |
| } | |
| } | |
| put_folios: | |
| - for (i = 0; i < folio_batch_count(&fbatch); i++) { | |
| - struct folio *folio = fbatch.folios[i]; | |
| + for (i = 0; i < folio_batch_count(&area.fbatch); i++) { | |
| + struct folio *folio = area.fbatch.folios[i]; | |
| filemap_end_dropbehind_read(folio); | |
| folio_put(folio); | |
| } | |
| - folio_batch_init(&fbatch); | |
| + folio_batch_init(&area.fbatch); | |
| } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); | |
| - | |
| +out: | |
| file_accessed(filp); | |
| ra->prev_pos = last_pos; | |
| return already_read ? already_read : error; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment