108 lines
4.4 KiB
Diff
108 lines
4.4 KiB
Diff
|
From f6f5cd12972307324de5decd7fa41b0b3c98639c Mon Sep 17 00:00:00 2001
|
||
|
From: Boris Burkov <boris@bur.io>
|
||
|
Date: Fri, 18 Oct 2024 15:44:34 -0700
|
||
|
Subject: btrfs: fix read corruption due to race with extent map merging
|
||
|
|
||
|
In debugging some corrupt squashfs files, we observed symptoms of
|
||
|
corrupt page cache pages but correct on-disk contents. Further
|
||
|
investigation revealed that the exact symptom was a correct page
|
||
|
followed by an incorrect, duplicate, page. This got us thinking about
|
||
|
extent maps.
|
||
|
|
||
|
commit ac05ca913e9f ("Btrfs: fix race between using extent maps and merging them")
|
||
|
enforces a reference count on the primary `em` extent_map being merged,
|
||
|
as that one gets modified.
|
||
|
|
||
|
However, since,
|
||
|
commit 3d2ac9922465 ("btrfs: introduce new members for extent_map")
|
||
|
both 'em' and 'merge' get modified, which started modifying 'merge'
|
||
|
and thus introduced the same race.
|
||
|
|
||
|
We were able to reproduce this by looping the affected squashfs workload
|
||
|
in parallel on a bunch of separate btrfs-es while also dropping caches.
|
||
|
We are still working on a simple enough reproducer to make into an fstest.
|
||
|
|
||
|
The simplest fix is to stop modifying 'merge', which is not essential,
|
||
|
as it is dropped immediately after the merge. This behavior is simply
|
||
|
a consequence of the order of the two extent maps being important in
|
||
|
computing the new values. Modify merge_ondisk_extents to take prev and
|
||
|
next by const* and also take a third merged parameter that it puts the
|
||
|
results in. Note that this introduces the rather odd behavior of passing
|
||
|
'em' to merge_ondisk_extents as a const * and as a regular ptr.
|
||
|
|
||
|
Fixes: 3d2ac9922465 ("btrfs: introduce new members for extent_map")
|
||
|
CC: stable@vger.kernel.org # 6.11+
|
||
|
Reviewed-by: Qu Wenruo <wqu@suse.com>
|
||
|
Reviewed-by: Filipe Manana <fdmanana@suse.com>
|
||
|
Signed-off-by: Omar Sandoval <osandov@fb.com>
|
||
|
Signed-off-by: Boris Burkov <boris@bur.io>
|
||
|
Signed-off-by: David Sterba <dsterba@suse.com>
|
||
|
---
|
||
|
fs/btrfs/extent_map.c | 31 ++++++++++++++++---------------
|
||
|
1 file changed, 16 insertions(+), 15 deletions(-)
|
||
|
|
||
|
--- a/fs/btrfs/extent_map.c
|
||
|
+++ b/fs/btrfs/extent_map.c
|
||
|
@@ -240,13 +240,19 @@ static bool mergeable_maps(const struct
|
||
|
/*
|
||
|
* Handle the on-disk data extents merge for @prev and @next.
|
||
|
*
|
||
|
+ * @prev: left extent to merge
|
||
|
+ * @next: right extent to merge
|
||
|
+ * @merged: the extent we will not discard after the merge; updated with new values
|
||
|
+ *
|
||
|
+ * After this, one of the two extents is the new merged extent and the other is
|
||
|
+ * removed from the tree and likely freed. Note that @merged is one of @prev/@next
|
||
|
+ * so there is const/non-const aliasing occurring here.
|
||
|
+ *
|
||
|
* Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes.
|
||
|
* For now only uncompressed regular extent can be merged.
|
||
|
- *
|
||
|
- * @prev and @next will be both updated to point to the new merged range.
|
||
|
- * Thus one of them should be removed by the caller.
|
||
|
*/
|
||
|
-static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next)
|
||
|
+static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next,
|
||
|
+ struct extent_map *merged)
|
||
|
{
|
||
|
u64 new_disk_bytenr;
|
||
|
u64 new_disk_num_bytes;
|
||
|
@@ -281,15 +287,10 @@ static void merge_ondisk_extents(struct
|
||
|
new_disk_bytenr;
|
||
|
new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr;
|
||
|
|
||
|
- prev->disk_bytenr = new_disk_bytenr;
|
||
|
- prev->disk_num_bytes = new_disk_num_bytes;
|
||
|
- prev->ram_bytes = new_disk_num_bytes;
|
||
|
- prev->offset = new_offset;
|
||
|
-
|
||
|
- next->disk_bytenr = new_disk_bytenr;
|
||
|
- next->disk_num_bytes = new_disk_num_bytes;
|
||
|
- next->ram_bytes = new_disk_num_bytes;
|
||
|
- next->offset = new_offset;
|
||
|
+ merged->disk_bytenr = new_disk_bytenr;
|
||
|
+ merged->disk_num_bytes = new_disk_num_bytes;
|
||
|
+ merged->ram_bytes = new_disk_num_bytes;
|
||
|
+ merged->offset = new_offset;
|
||
|
}
|
||
|
|
||
|
static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
|
||
|
@@ -358,7 +359,7 @@ static void try_merge_map(struct btrfs_i
|
||
|
em->generation = max(em->generation, merge->generation);
|
||
|
|
||
|
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
|
||
|
- merge_ondisk_extents(merge, em);
|
||
|
+ merge_ondisk_extents(merge, em, em);
|
||
|
em->flags |= EXTENT_FLAG_MERGED;
|
||
|
|
||
|
validate_extent_map(fs_info, em);
|
||
|
@@ -375,7 +376,7 @@ static void try_merge_map(struct btrfs_i
|
||
|
if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
|
||
|
em->len += merge->len;
|
||
|
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
|
||
|
- merge_ondisk_extents(em, merge);
|
||
|
+ merge_ondisk_extents(em, merge, em);
|
||
|
validate_extent_map(fs_info, em);
|
||
|
rb_erase(&merge->rb_node, &tree->root);
|
||
|
RB_CLEAR_NODE(&merge->rb_node);
|